src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/macros.h"
  36 #include "main/shaderobj.h"
  37 #include "main/uniforms.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "glsl/glsl_types.h"
  50 #include "glsl/ir_print_visitor.h"
  51
  52 void
  53 fs_inst::init()
  54 {
  55    memset(this, 0, sizeof(*this));
  56    this->opcode = BRW_OPCODE_NOP;
  57    this->conditional_mod = BRW_CONDITIONAL_NONE;
  58
  59    this->dst = reg_undef;
  60    this->src[0] = reg_undef;
  61    this->src[1] = reg_undef;
  62    this->src[2] = reg_undef;
  63 }
  64
  65 fs_inst::fs_inst()
  66 {
  67    init();
  68 }
  69
  70 fs_inst::fs_inst(enum opcode opcode)
  71 {
  72    init();
  73    this->opcode = opcode;
  74 }
  75
  76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
  77 {
  78    init();
  79    this->opcode = opcode;
  80    this->dst = dst;
  81
  82    if (dst.file == GRF)
  83       assert(dst.reg_offset >= 0);
  84 }
  85
  86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
  87 {
  88    init();
  89    this->opcode = opcode;
  90    this->dst = dst;
  91    this->src[0] = src0;
  92
  93    if (dst.file == GRF)
  94       assert(dst.reg_offset >= 0);
  95    if (src[0].file == GRF)
  96       assert(src[0].reg_offset >= 0);
  97 }
  98
  99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 100 {
 101    init();
 102    this->opcode = opcode;
 103    this->dst = dst;
 104    this->src[0] = src0;
 105    this->src[1] = src1;
 106
 107    if (dst.file == GRF)
 108       assert(dst.reg_offset >= 0);
 109    if (src[0].file == GRF)
 110       assert(src[0].reg_offset >= 0);
 111    if (src[1].file == GRF)
 112       assert(src[1].reg_offset >= 0);
 113 }
 114
 115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
 116                  fs_reg src0, fs_reg src1, fs_reg src2)
 117 {
 118    init();
 119    this->opcode = opcode;
 120    this->dst = dst;
 121    this->src[0] = src0;
 122    this->src[1] = src1;
 123    this->src[2] = src2;
 124
 125    if (dst.file == GRF)
 126       assert(dst.reg_offset >= 0);
 127    if (src[0].file == GRF)
 128       assert(src[0].reg_offset >= 0);
 129    if (src[1].file == GRF)
 130       assert(src[1].reg_offset >= 0);
 131    if (src[2].file == GRF)
 132       assert(src[2].reg_offset >= 0);
 133 }
 134
 135 #define ALU1(op)                                                        \
 136    fs_inst *                                                            \
 137    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
 138    {                                                                    \
 139       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 140    }
 141
 142 #define ALU2(op)                                                        \
 143    fs_inst *                                                            \
 144    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 145    {                                                                    \
 146       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 147    }
 148
/* Instantiate the fs_visitor builder methods for the one-source opcodes. */
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
/* Instantiate the fs_visitor builder methods for the two-source opcodes. */
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
 164
 165 /** Gen4 predicated IF. */
 166 fs_inst *
 167 fs_visitor::IF(uint32_t predicate)
 168 {
 169    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 170    inst->predicate = predicate;
 171    return inst;
 172 }
 173
 174 /** Gen6+ IF with embedded comparison. */
 175 fs_inst *
 176 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 177 {
 178    assert(intel->gen >= 6);
 179    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 180                                         reg_null_d, src0, src1);
 181    inst->conditional_mod = condition;
 182    return inst;
 183 }
 184
 185 /**
 186  * CMP: Sets the low bit of the destination channels with the result
 187  * of the comparison, while the upper bits are undefined, and updates
 188  * the flag register with the packed 16 bits of the result.
 189  */
 190 fs_inst *
 191 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 192 {
 193    fs_inst *inst;
 194
 195    /* Take the instruction:
 196     *
 197     * CMP null<d> src0<f> src1<f>
 198     *
 199     * Original gen4 does type conversion to the destination type before
 200     * comparison, producing garbage results for floating point comparisons.
 201     * gen5 does the comparison on the execution type (resolved source types),
 202     * so dst type doesn't matter.  gen6 does comparison and then uses the
 203     * result as if it was the dst type with no conversion, which happens to
 204     * mostly work out for float-interpreted-as-int since our comparisons are
 205     * for >0, =0, <0.
 206     */
 207    if (intel->gen == 4) {
 208       dst.type = src0.type;
 209       if (dst.file == FIXED_HW_REG)
 210          dst.fixed_hw_reg.type = dst.type;
 211    }
 212
 213    resolve_ud_negate(&src0);
 214    resolve_ud_negate(&src1);
 215
 216    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 217    inst->conditional_mod = condition;
 218
 219    return inst;
 220 }
 221
 222 bool
 223 fs_inst::equals(fs_inst *inst)
 224 {
 225    return (opcode == inst->opcode &&
 226            dst.equals(inst->dst) &&
 227            src[0].equals(inst->src[0]) &&
 228            src[1].equals(inst->src[1]) &&
 229            src[2].equals(inst->src[2]) &&
 230            saturate == inst->saturate &&
 231            predicate == inst->predicate &&
 232            conditional_mod == inst->conditional_mod &&
 233            mlen == inst->mlen &&
 234            base_mrf == inst->base_mrf &&
 235            sampler == inst->sampler &&
 236            target == inst->target &&
 237            eot == inst->eot &&
 238            header_present == inst->header_present &&
 239            shadow_compare == inst->shadow_compare &&
 240            offset == inst->offset);
 241 }
 242
 243 int
 244 fs_inst::regs_written()
 245 {
 246    if (is_tex())
 247       return 4;
 248
 249    /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
 250     * but we don't currently use them...nor do we have an opcode for them.
 251     */
 252
 253    return 1;
 254 }
 255
 256 bool
 257 fs_inst::overwrites_reg(const fs_reg &reg)
 258 {
 259    return (reg.file == dst.file &&
 260            reg.reg == dst.reg &&
 261            reg.reg_offset >= dst.reg_offset  &&
 262            reg.reg_offset < dst.reg_offset + regs_written());
 263 }
 264
 265 bool
 266 fs_inst::is_tex()
 267 {
 268    return (opcode == SHADER_OPCODE_TEX ||
 269            opcode == FS_OPCODE_TXB ||
 270            opcode == SHADER_OPCODE_TXD ||
 271            opcode == SHADER_OPCODE_TXF ||
 272            opcode == SHADER_OPCODE_TXL ||
 273            opcode == SHADER_OPCODE_TXS);
 274 }
 275
 276 bool
 277 fs_inst::is_math()
 278 {
 279    return (opcode == SHADER_OPCODE_RCP ||
 280            opcode == SHADER_OPCODE_RSQ ||
 281            opcode == SHADER_OPCODE_SQRT ||
 282            opcode == SHADER_OPCODE_EXP2 ||
 283            opcode == SHADER_OPCODE_LOG2 ||
 284            opcode == SHADER_OPCODE_SIN ||
 285            opcode == SHADER_OPCODE_COS ||
 286            opcode == SHADER_OPCODE_INT_QUOTIENT ||
 287            opcode == SHADER_OPCODE_INT_REMAINDER ||
 288            opcode == SHADER_OPCODE_POW);
 289 }
 290
 291 void
 292 fs_reg::init()
 293 {
 294    memset(this, 0, sizeof(*this));
 295    this->smear = -1;
 296 }
 297
 298 /** Generic unset register constructor. */
 299 fs_reg::fs_reg()
 300 {
 301    init();
 302    this->file = BAD_FILE;
 303 }
 304
 305 /** Immediate value constructor. */
 306 fs_reg::fs_reg(float f)
 307 {
 308    init();
 309    this->file = IMM;
 310    this->type = BRW_REGISTER_TYPE_F;
 311    this->imm.f = f;
 312 }
 313
 314 /** Immediate value constructor. */
 315 fs_reg::fs_reg(int32_t i)
 316 {
 317    init();
 318    this->file = IMM;
 319    this->type = BRW_REGISTER_TYPE_D;
 320    this->imm.i = i;
 321 }
 322
 323 /** Immediate value constructor. */
 324 fs_reg::fs_reg(uint32_t u)
 325 {
 326    init();
 327    this->file = IMM;
 328    this->type = BRW_REGISTER_TYPE_UD;
 329    this->imm.u = u;
 330 }
 331
 332 /** Fixed brw_reg Immediate value constructor. */
 333 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 334 {
 335    init();
 336    this->file = FIXED_HW_REG;
 337    this->fixed_hw_reg = fixed_hw_reg;
 338    this->type = fixed_hw_reg.type;
 339 }
 340
 341 bool
 342 fs_reg::equals(const fs_reg &r) const
 343 {
 344    return (file == r.file &&
 345            reg == r.reg &&
 346            reg_offset == r.reg_offset &&
 347            type == r.type &&
 348            negate == r.negate &&
 349            abs == r.abs &&
 350            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 351                   sizeof(fixed_hw_reg)) == 0 &&
 352            smear == r.smear &&
 353            imm.u == r.imm.u);
 354 }
 355
 356 bool
 357 fs_reg::is_zero() const
 358 {
 359    if (file != IMM)
 360       return false;
 361
 362    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 363 }
 364
 365 bool
 366 fs_reg::is_one() const
 367 {
 368    if (file != IMM)
 369       return false;
 370
 371    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 372 }
 373
 374 int
 375 fs_visitor::type_size(const struct glsl_type *type)
 376 {
 377    unsigned int size, i;
 378
 379    switch (type->base_type) {
 380    case GLSL_TYPE_UINT:
 381    case GLSL_TYPE_INT:
 382    case GLSL_TYPE_FLOAT:
 383    case GLSL_TYPE_BOOL:
 384       return type->components();
 385    case GLSL_TYPE_ARRAY:
 386       return type_size(type->fields.array) * type->length;
 387    case GLSL_TYPE_STRUCT:
 388       size = 0;
 389       for (i = 0; i < type->length; i++) {
 390          size += type_size(type->fields.structure[i].type);
 391       }
 392       return size;
 393    case GLSL_TYPE_SAMPLER:
 394       /* Samplers take up no register space, since they're baked in at
 395        * link time.
 396        */
 397       return 0;
 398    default:
 399       assert(!"not reached");
 400       return 0;
 401    }
 402 }
 403
 404 void
 405 fs_visitor::fail(const char *format, ...)
 406 {
 407    va_list va;
 408    char *msg;
 409
 410    if (failed)
 411       return;
 412
 413    failed = true;
 414
 415    va_start(va, format);
 416    msg = ralloc_vasprintf(mem_ctx, format, va);
 417    va_end(va);
 418    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 419
 420    this->fail_msg = msg;
 421
 422    if (INTEL_DEBUG & DEBUG_WM) {
 423       fprintf(stderr, "%s",  msg);
 424    }
 425 }
 426
 427 fs_inst *
 428 fs_visitor::emit(enum opcode opcode)
 429 {
 430    return emit(fs_inst(opcode));
 431 }
 432
 433 fs_inst *
 434 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 435 {
 436    return emit(fs_inst(opcode, dst));
 437 }
 438
 439 fs_inst *
 440 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 441 {
 442    return emit(fs_inst(opcode, dst, src0));
 443 }
 444
 445 fs_inst *
 446 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 447 {
 448    return emit(fs_inst(opcode, dst, src0, src1));
 449 }
 450
 451 fs_inst *
 452 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 453                  fs_reg src0, fs_reg src1, fs_reg src2)
 454 {
 455    return emit(fs_inst(opcode, dst, src0, src1, src2));
 456 }
 457
 458 void
 459 fs_visitor::push_force_uncompressed()
 460 {
 461    force_uncompressed_stack++;
 462 }
 463
 464 void
 465 fs_visitor::pop_force_uncompressed()
 466 {
 467    force_uncompressed_stack--;
 468    assert(force_uncompressed_stack >= 0);
 469 }
 470
 471 void
 472 fs_visitor::push_force_sechalf()
 473 {
 474    force_sechalf_stack++;
 475 }
 476
 477 void
 478 fs_visitor::pop_force_sechalf()
 479 {
 480    force_sechalf_stack--;
 481    assert(force_sechalf_stack >= 0);
 482 }
 483
 484 /**
 485  * Returns how many MRFs an FS opcode will write over.
 486  *
 487  * Note that this is not the 0 or 1 implied writes in an actual gen
 488  * instruction -- the FS opcodes often generate MOVs in addition.
 489  */
 490 int
 491 fs_visitor::implied_mrf_writes(fs_inst *inst)
 492 {
 493    if (inst->mlen == 0)
 494       return 0;
 495
 496    switch (inst->opcode) {
 497    case SHADER_OPCODE_RCP:
 498    case SHADER_OPCODE_RSQ:
 499    case SHADER_OPCODE_SQRT:
 500    case SHADER_OPCODE_EXP2:
 501    case SHADER_OPCODE_LOG2:
 502    case SHADER_OPCODE_SIN:
 503    case SHADER_OPCODE_COS:
 504       return 1 * dispatch_width / 8;
 505    case SHADER_OPCODE_POW:
 506    case SHADER_OPCODE_INT_QUOTIENT:
 507    case SHADER_OPCODE_INT_REMAINDER:
 508       return 2 * dispatch_width / 8;
 509    case SHADER_OPCODE_TEX:
 510    case FS_OPCODE_TXB:
 511    case SHADER_OPCODE_TXD:
 512    case SHADER_OPCODE_TXF:
 513    case SHADER_OPCODE_TXL:
 514    case SHADER_OPCODE_TXS:
 515       return 1;
 516    case FS_OPCODE_FB_WRITE:
 517       return 2;
 518    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 519    case FS_OPCODE_UNSPILL:
 520       return 1;
 521    case FS_OPCODE_SPILL:
 522       return 2;
 523    default:
 524       assert(!"not reached");
 525       return inst->mlen;
 526    }
 527 }
 528
 529 int
 530 fs_visitor::virtual_grf_alloc(int size)
 531 {
 532    if (virtual_grf_array_size <= virtual_grf_count) {
 533       if (virtual_grf_array_size == 0)
 534          virtual_grf_array_size = 16;
 535       else
 536          virtual_grf_array_size *= 2;
 537       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 538                                    virtual_grf_array_size);
 539    }
 540    virtual_grf_sizes[virtual_grf_count] = size;
 541    return virtual_grf_count++;
 542 }
 543
 544 /** Fixed HW reg constructor. */
 545 fs_reg::fs_reg(enum register_file file, int reg)
 546 {
 547    init();
 548    this->file = file;
 549    this->reg = reg;
 550    this->type = BRW_REGISTER_TYPE_F;
 551 }
 552
 553 /** Fixed HW reg constructor. */
 554 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 555 {
 556    init();
 557    this->file = file;
 558    this->reg = reg;
 559    this->type = type;
 560 }
 561
 562 /** Automatic reg constructor. */
 563 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 564 {
 565    init();
 566
 567    this->file = GRF;
 568    this->reg = v->virtual_grf_alloc(v->type_size(type));
 569    this->reg_offset = 0;
 570    this->type = brw_type_for_base_type(type);
 571 }
 572
 573 fs_reg *
 574 fs_visitor::variable_storage(ir_variable *var)
 575 {
 576    return (fs_reg *)hash_table_find(this->variable_ht, var);
 577 }
 578
 579 void
 580 import_uniforms_callback(const void *key,
 581                          void *data,
 582                          void *closure)
 583 {
 584    struct hash_table *dst_ht = (struct hash_table *)closure;
 585    const fs_reg *reg = (const fs_reg *)data;
 586
 587    if (reg->file != UNIFORM)
 588       return;
 589
 590    hash_table_insert(dst_ht, data, key);
 591 }
 592
 593 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 594  * This brings in those uniform definitions
 595  */
 596 void
 597 fs_visitor::import_uniforms(fs_visitor *v)
 598 {
 599    hash_table_call_foreach(v->variable_ht,
 600                            import_uniforms_callback,
 601                            variable_ht);
 602    this->params_remap = v->params_remap;
 603 }
 604
 605 /* Our support for uniforms is piggy-backed on the struct
 606  * gl_fragment_program, because that's where the values actually
 607  * get stored, rather than in some global gl_shader_program uniform
 608  * store.
 609  */
 610 int
 611 fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
 612 {
 613    unsigned int offset = 0;
 614
 615    if (type->is_matrix()) {
 616       const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
 617                                                         type->vector_elements,
 618                                                         1);
 619
 620       for (unsigned int i = 0; i < type->matrix_columns; i++) {
 621          offset += setup_uniform_values(loc + offset, column);
 622       }
 623
 624       return offset;
 625    }
 626
 627    switch (type->base_type) {
 628    case GLSL_TYPE_FLOAT:
 629    case GLSL_TYPE_UINT:
 630    case GLSL_TYPE_INT:
 631    case GLSL_TYPE_BOOL:
 632       for (unsigned int i = 0; i < type->vector_elements; i++) {
 633          unsigned int param = c->prog_data.nr_params++;
 634
 635          this->param_index[param] = loc;
 636          this->param_offset[param] = i;
 637       }
 638       return 1;
 639
 640    case GLSL_TYPE_STRUCT:
 641       for (unsigned int i = 0; i < type->length; i++) {
 642          offset += setup_uniform_values(loc + offset,
 643                                         type->fields.structure[i].type);
 644       }
 645       return offset;
 646
 647    case GLSL_TYPE_ARRAY:
 648       for (unsigned int i = 0; i < type->length; i++) {
 649          offset += setup_uniform_values(loc + offset, type->fields.array);
 650       }
 651       return offset;
 652
 653    case GLSL_TYPE_SAMPLER:
 654       /* The sampler takes up a slot, but we don't use any values from it. */
 655       return 1;
 656
 657    default:
 658       assert(!"not reached");
 659       return 0;
 660    }
 661 }
 662
 663
 664 /* Our support for builtin uniforms is even scarier than non-builtin.
 665  * It sits on top of the PROG_STATE_VAR parameters that are
 666  * automatically updated from GL context state.
 667  */
 668 void
 669 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 670 {
 671    const ir_state_slot *const slots = ir->state_slots;
 672    assert(ir->state_slots != NULL);
 673
 674    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 675       /* This state reference has already been setup by ir_to_mesa, but we'll
 676        * get the same index back here.
 677        */
 678       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 679                                             (gl_state_index *)slots[i].tokens);
 680
 681       /* Add each of the unique swizzles of the element as a parameter.
 682        * This'll end up matching the expected layout of the
 683        * array/matrix/structure we're trying to fill in.
 684        */
 685       int last_swiz = -1;
 686       for (unsigned int j = 0; j < 4; j++) {
 687          int swiz = GET_SWZ(slots[i].swizzle, j);
 688          if (swiz == last_swiz)
 689             break;
 690          last_swiz = swiz;
 691
 692          this->param_index[c->prog_data.nr_params] = index;
 693          this->param_offset[c->prog_data.nr_params] = swiz;
 694          c->prog_data.nr_params++;
 695       }
 696    }
 697 }
 698
 699 fs_reg *
 700 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
 701 {
 702    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 703    fs_reg wpos = *reg;
 704    bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
 705
 706    /* gl_FragCoord.x */
 707    if (ir->pixel_center_integer) {
 708       emit(MOV(wpos, this->pixel_x));
 709    } else {
 710       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
 711    }
 712    wpos.reg_offset++;
 713
 714    /* gl_FragCoord.y */
 715    if (!flip && ir->pixel_center_integer) {
 716       emit(MOV(wpos, this->pixel_y));
 717    } else {
 718       fs_reg pixel_y = this->pixel_y;
 719       float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
 720
 721       if (flip) {
 722          pixel_y.negate = true;
 723          offset += c->key.drawable_height - 1.0;
 724       }
 725
 726       emit(ADD(wpos, pixel_y, fs_reg(offset)));
 727    }
 728    wpos.reg_offset++;
 729
 730    /* gl_FragCoord.z */
 731    if (intel->gen >= 6) {
 732       emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
 733    } else {
 734       emit(FS_OPCODE_LINTERP, wpos,
 735            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 736            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 737            interp_reg(FRAG_ATTRIB_WPOS, 2));
 738    }
 739    wpos.reg_offset++;
 740
 741    /* gl_FragCoord.w: Already set up in emit_interpolation */
 742    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
 743
 744    return reg;
 745 }
 746
 747 fs_inst *
 748 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
 749                          glsl_interp_qualifier interpolation_mode,
 750                          bool is_centroid)
 751 {
 752    brw_wm_barycentric_interp_mode barycoord_mode;
 753    if (is_centroid) {
 754       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 755          barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
 756       else
 757          barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
 758    } else {
 759       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 760          barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 761       else
 762          barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
 763    }
 764    return emit(FS_OPCODE_LINTERP, attr,
 765                this->delta_x[barycoord_mode],
 766                this->delta_y[barycoord_mode], interp);
 767 }
 768
 769 fs_reg *
 770 fs_visitor::emit_general_interpolation(ir_variable *ir)
 771 {
 772    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 773    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
 774    fs_reg attr = *reg;
 775
 776    unsigned int array_elements;
 777    const glsl_type *type;
 778
 779    if (ir->type->is_array()) {
 780       array_elements = ir->type->length;
 781       if (array_elements == 0) {
 782          fail("dereferenced array '%s' has length 0\n", ir->name);
 783       }
 784       type = ir->type->fields.array;
 785    } else {
 786       array_elements = 1;
 787       type = ir->type;
 788    }
 789
 790    glsl_interp_qualifier interpolation_mode =
 791       ir->determine_interpolation_mode(c->key.flat_shade);
 792
 793    int location = ir->location;
 794    for (unsigned int i = 0; i < array_elements; i++) {
 795       for (unsigned int j = 0; j < type->matrix_columns; j++) {
 796          if (urb_setup[location] == -1) {
 797             /* If there's no incoming setup data for this slot, don't
 798              * emit interpolation for it.
 799              */
 800             attr.reg_offset += type->vector_elements;
 801             location++;
 802             continue;
 803          }
 804
 805          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
 806             /* Constant interpolation (flat shading) case. The SF has
 807              * handed us defined values in only the constant offset
 808              * field of the setup reg.
 809              */
 810             for (unsigned int k = 0; k < type->vector_elements; k++) {
 811                struct brw_reg interp = interp_reg(location, k);
 812                interp = suboffset(interp, 3);
 813                interp.type = reg->type;
 814                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
 815                attr.reg_offset++;
 816             }
 817          } else {
 818             /* Smooth/noperspective interpolation case. */
 819             for (unsigned int k = 0; k < type->vector_elements; k++) {
 820                /* FINISHME: At some point we probably want to push
 821                 * this farther by giving similar treatment to the
 822                 * other potentially constant components of the
 823                 * attribute, as well as making brw_vs_constval.c
 824                 * handle varyings other than gl_TexCoord.
 825                 */
 826                if (location >= FRAG_ATTRIB_TEX0 &&
 827                    location <= FRAG_ATTRIB_TEX7 &&
 828                    k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
 829                   emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
 830                } else {
 831                   struct brw_reg interp = interp_reg(location, k);
 832                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
 833                                ir->centroid);
 834                   if (brw->needs_unlit_centroid_workaround && ir->centroid) {
 835                      /* Get the pixel/sample mask into f0 so that we know
 836                       * which pixels are lit.  Then, for each channel that is
 837                       * unlit, replace the centroid data with non-centroid
 838                       * data.
 839                       */
 840                      emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
 841                      fs_inst *inst = emit_linterp(attr, fs_reg(interp),
 842                                                   interpolation_mode, false);
 843                      inst->predicate = BRW_PREDICATE_NORMAL;
 844                      inst->predicate_inverse = true;
 845                   }
 846                   if (intel->gen < 6) {
 847                      emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
 848                   }
 849                }
 850                attr.reg_offset++;
 851             }
 852
 853          }
 854          location++;
 855       }
 856    }
 857
 858    return reg;
 859 }
 860
 861 fs_reg *
 862 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
 863 {
 864    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 865
 866    /* The frontfacing comes in as a bit in the thread payload. */
 867    if (intel->gen >= 6) {
 868       emit(BRW_OPCODE_ASR, *reg,
 869            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
 870            fs_reg(15));
 871       emit(BRW_OPCODE_NOT, *reg, *reg);
 872       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
 873    } else {
 874       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 875       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 876        * us front face
 877        */
 878       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
 879       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
 880    }
 881
 882    return reg;
 883 }
 884
 885 fs_inst *
 886 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
 887 {
 888    switch (opcode) {
 889    case SHADER_OPCODE_RCP:
 890    case SHADER_OPCODE_RSQ:
 891    case SHADER_OPCODE_SQRT:
 892    case SHADER_OPCODE_EXP2:
 893    case SHADER_OPCODE_LOG2:
 894    case SHADER_OPCODE_SIN:
 895    case SHADER_OPCODE_COS:
 896       break;
 897    default:
 898       assert(!"not reached: bad math opcode");
 899       return NULL;
 900    }
 901
 902    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
 903     * might be able to do better by doing execsize = 1 math and then
 904     * expanding that result out, but we would need to be careful with
 905     * masking.
 906     *
 907     * Gen 6 hardware ignores source modifiers (negate and abs) on math
 908     * instructions, so we also move to a temp to set those up.
 909     */
 910    if (intel->gen == 6 && (src.file == UNIFORM ||
 911                            src.abs ||
 912                            src.negate)) {
 913       fs_reg expanded = fs_reg(this, glsl_type::float_type);
 914       emit(BRW_OPCODE_MOV, expanded, src);
 915       src = expanded;
 916    }
 917
 918    fs_inst *inst = emit(opcode, dst, src);
 919
 920    if (intel->gen < 6) {
 921       inst->base_mrf = 2;
 922       inst->mlen = dispatch_width / 8;
 923    }
 924
 925    return inst;
 926 }
 927
 928 fs_inst *
 929 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 930 {
 931    int base_mrf = 2;
 932    fs_inst *inst;
 933
 934    switch (opcode) {
 935    case SHADER_OPCODE_POW:
 936    case SHADER_OPCODE_INT_QUOTIENT:
 937    case SHADER_OPCODE_INT_REMAINDER:
 938       break;
 939    default:
 940       assert(!"not reached: unsupported binary math opcode.");
 941       return NULL;
 942    }
 943
 944    if (intel->gen >= 7) {
 945       inst = emit(opcode, dst, src0, src1);
 946    } else if (intel->gen == 6) {
 947       /* Can't do hstride == 0 args to gen6 math, so expand it out.
 948        *
 949        * The hardware ignores source modifiers (negate and abs) on math
 950        * instructions, so we also move to a temp to set those up.
 951        */
 952       if (src0.file == UNIFORM || src0.abs || src0.negate) {
 953          fs_reg expanded = fs_reg(this, glsl_type::float_type);
 954          expanded.type = src0.type;
 955          emit(BRW_OPCODE_MOV, expanded, src0);
 956          src0 = expanded;
 957       }
 958
 959       if (src1.file == UNIFORM || src1.abs || src1.negate) {
 960          fs_reg expanded = fs_reg(this, glsl_type::float_type);
 961          expanded.type = src1.type;
 962          emit(BRW_OPCODE_MOV, expanded, src1);
 963          src1 = expanded;
 964       }
 965
 966       inst = emit(opcode, dst, src0, src1);
 967    } else {
 968       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
 969        * "Message Payload":
 970        *
 971        * "Operand0[7].  For the INT DIV functions, this operand is the
 972        *  denominator."
 973        *  ...
 974        * "Operand1[7].  For the INT DIV functions, this operand is the
 975        *  numerator."
 976        */
 977       bool is_int_div = opcode != SHADER_OPCODE_POW;
 978       fs_reg &op0 = is_int_div ? src1 : src0;
 979       fs_reg &op1 = is_int_div ? src0 : src1;
 980
 981       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
 982       inst = emit(opcode, dst, op0, reg_null_f);
 983
 984       inst->base_mrf = base_mrf;
 985       inst->mlen = 2 * dispatch_width / 8;
 986    }
 987    return inst;
 988 }
 989
 990 /**
 991  * To be called after the last _mesa_add_state_reference() call, to
 992  * set up prog_data.param[] for assign_curb_setup() and
 993  * setup_pull_constants().
 994  */
 995 void
 996 fs_visitor::setup_paramvalues_refs()
 997 {
 998    if (dispatch_width != 8)
 999       return;
1000
1001    /* Set up the pointers to ParamValues now that that array is finalized. */
1002    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1003       c->prog_data.param[i] =
1004          (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
1005          this->param_offset[i];
1006    }
1007 }
1008
1009 void
1010 fs_visitor::assign_curb_setup()
1011 {
1012    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1013    if (dispatch_width == 8) {
1014       c->prog_data.first_curbe_grf = c->nr_payload_regs;
1015    } else {
1016       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1017    }
1018
1019    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1020    foreach_list(node, &this->instructions) {
1021       fs_inst *inst = (fs_inst *)node;
1022
1023       for (unsigned int i = 0; i < 3; i++) {
1024          if (inst->src[i].file == UNIFORM) {
1025             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1026             struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1027                                                   constant_nr / 8,
1028                                                   constant_nr % 8);
1029
1030             inst->src[i].file = FIXED_HW_REG;
1031             inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1032          }
1033       }
1034    }
1035 }
1036
1037 void
1038 fs_visitor::calculate_urb_setup()
1039 {
1040    for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1041       urb_setup[i] = -1;
1042    }
1043
1044    int urb_next = 0;
1045    /* Figure out where each of the incoming setup attributes lands. */
1046    if (intel->gen >= 6) {
1047       for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1048          if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1049             urb_setup[i] = urb_next++;
1050          }
1051       }
1052    } else {
1053       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1054       for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
1055          /* Point size is packed into the header, not as a general attribute */
1056          if (i == VERT_RESULT_PSIZ)
1057             continue;
1058
1059          if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1060             int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1061
1062             /* The back color slot is skipped when the front color is
1063              * also written to.  In addition, some slots can be
1064              * written in the vertex shader and not read in the
1065              * fragment shader.  So the register number must always be
1066              * incremented, mapped or not.
1067              */
1068             if (fp_index >= 0)
1069                urb_setup[fp_index] = urb_next;
1070             urb_next++;
1071          }
1072       }
1073
1074       /*
1075        * It's a FS only attribute, and we did interpolation for this attribute
1076        * in SF thread. So, count it here, too.
1077        *
1078        * See compile_sf_prog() for more info.
1079        */
1080       if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
1081          urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
1082    }
1083
1084    /* Each attribute is 4 setup channels, each of which is half a reg. */
1085    c->prog_data.urb_read_length = urb_next * 2;
1086 }
1087
1088 void
1089 fs_visitor::assign_urb_setup()
1090 {
1091    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1092
1093    /* Offset all the urb_setup[] index by the actual position of the
1094     * setup regs, now that the location of the constants has been chosen.
1095     */
1096    foreach_list(node, &this->instructions) {
1097       fs_inst *inst = (fs_inst *)node;
1098
1099       if (inst->opcode == FS_OPCODE_LINTERP) {
1100          assert(inst->src[2].file == FIXED_HW_REG);
1101          inst->src[2].fixed_hw_reg.nr += urb_start;
1102       }
1103
1104       if (inst->opcode == FS_OPCODE_CINTERP) {
1105          assert(inst->src[0].file == FIXED_HW_REG);
1106          inst->src[0].fixed_hw_reg.nr += urb_start;
1107       }
1108    }
1109
1110    this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1111 }
1112
1113 /**
1114  * Split large virtual GRFs into separate components if we can.
1115  *
1116  * This is mostly duplicated with what brw_fs_vector_splitting does,
1117  * but that's really conservative because it's afraid of doing
1118  * splitting that doesn't result in real progress after the rest of
1119  * the optimization phases, which would cause infinite looping in
1120  * optimization.  We can do it once here, safely.  This also has the
1121  * opportunity to split interpolated values, or maybe even uniforms,
1122  * which we don't have at the IR level.
1123  *
1124  * We want to split, because virtual GRFs are what we register
1125  * allocate and spill (due to contiguousness requirements for some
1126  * instructions), and they're what we naturally generate in the
1127  * codegen process, but most virtual GRFs don't actually need to be
1128  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1129  * live intervals and better dead code elimination and coalescing.
1130  */
1131 void
1132 fs_visitor::split_virtual_grfs()
1133 {
1134    int num_vars = this->virtual_grf_count;
1135    bool split_grf[num_vars];
1136    int new_virtual_grf[num_vars];
1137
1138    /* Try to split anything > 0 sized. */
1139    for (int i = 0; i < num_vars; i++) {
1140       if (this->virtual_grf_sizes[i] != 1)
1141          split_grf[i] = true;
1142       else
1143          split_grf[i] = false;
1144    }
1145
1146    if (brw->has_pln &&
1147        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1148       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1149        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1150        * Gen6, that was the only supported interpolation mode, and since Gen6,
1151        * delta_x and delta_y are in fixed hardware registers.
1152        */
1153       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1154          false;
1155    }
1156
1157    foreach_list(node, &this->instructions) {
1158       fs_inst *inst = (fs_inst *)node;
1159
1160       /* If there's a SEND message that requires contiguous destination
1161        * registers, no splitting is allowed.
1162        */
1163       if (inst->regs_written() > 1) {
1164          split_grf[inst->dst.reg] = false;
1165       }
1166    }
1167
1168    /* Allocate new space for split regs.  Note that the virtual
1169     * numbers will be contiguous.
1170     */
1171    for (int i = 0; i < num_vars; i++) {
1172       if (split_grf[i]) {
1173          new_virtual_grf[i] = virtual_grf_alloc(1);
1174          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1175             int reg = virtual_grf_alloc(1);
1176             assert(reg == new_virtual_grf[i] + j - 1);
1177             (void) reg;
1178          }
1179          this->virtual_grf_sizes[i] = 1;
1180       }
1181    }
1182
1183    foreach_list(node, &this->instructions) {
1184       fs_inst *inst = (fs_inst *)node;
1185
1186       if (inst->dst.file == GRF &&
1187           split_grf[inst->dst.reg] &&
1188           inst->dst.reg_offset != 0) {
1189          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1190                           inst->dst.reg_offset - 1);
1191          inst->dst.reg_offset = 0;
1192       }
1193       for (int i = 0; i < 3; i++) {
1194          if (inst->src[i].file == GRF &&
1195              split_grf[inst->src[i].reg] &&
1196              inst->src[i].reg_offset != 0) {
1197             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1198                                 inst->src[i].reg_offset - 1);
1199             inst->src[i].reg_offset = 0;
1200          }
1201       }
1202    }
1203    this->live_intervals_valid = false;
1204 }
1205
1206 /**
1207  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1208  *
1209  * During code generation, we create tons of temporary variables, many of
1210  * which get immediately killed and are never used again.  Yet, in later
1211  * optimization and analysis passes, such as compute_live_intervals, we need
1212  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1213  * overhead.
1214  */
1215 void
1216 fs_visitor::compact_virtual_grfs()
1217 {
1218    /* Mark which virtual GRFs are used, and count how many. */
1219    int remap_table[this->virtual_grf_count];
1220    memset(remap_table, -1, sizeof(remap_table));
1221
1222    foreach_list(node, &this->instructions) {
1223       const fs_inst *inst = (const fs_inst *) node;
1224
1225       if (inst->dst.file == GRF)
1226          remap_table[inst->dst.reg] = 0;
1227
1228       for (int i = 0; i < 3; i++) {
1229          if (inst->src[i].file == GRF)
1230             remap_table[inst->src[i].reg] = 0;
1231       }
1232    }
1233
1234    /* In addition to registers used in instructions, fs_visitor keeps
1235     * direct references to certain special values which must be patched:
1236     */
1237    fs_reg *special[] = {
1238       &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1239       &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1240       &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1241       &delta_x[0], &delta_x[1], &delta_x[2],
1242       &delta_x[3], &delta_x[4], &delta_x[5],
1243       &delta_y[0], &delta_y[1], &delta_y[2],
1244       &delta_y[3], &delta_y[4], &delta_y[5],
1245    };
1246    STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1247    STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1248
1249    /* Treat all special values as used, to be conservative */
1250    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1251       if (special[i]->file == GRF)
1252          remap_table[special[i]->reg] = 0;
1253    }
1254
1255    /* Compact the GRF arrays. */
1256    int new_index = 0;
1257    for (int i = 0; i < this->virtual_grf_count; i++) {
1258       if (remap_table[i] != -1) {
1259          remap_table[i] = new_index;
1260          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1261          if (live_intervals_valid) {
1262             virtual_grf_use[new_index] = virtual_grf_use[i];
1263             virtual_grf_def[new_index] = virtual_grf_def[i];
1264          }
1265          ++new_index;
1266       }
1267    }
1268
1269    this->virtual_grf_count = new_index;
1270
1271    /* Patch all the instructions to use the newly renumbered registers */
1272    foreach_list(node, &this->instructions) {
1273       fs_inst *inst = (fs_inst *) node;
1274
1275       if (inst->dst.file == GRF)
1276          inst->dst.reg = remap_table[inst->dst.reg];
1277
1278       for (int i = 0; i < 3; i++) {
1279          if (inst->src[i].file == GRF)
1280             inst->src[i].reg = remap_table[inst->src[i].reg];
1281       }
1282    }
1283
1284    /* Patch all the references to special values */
1285    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1286       if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1287          special[i]->reg = remap_table[special[i]->reg];
1288    }
1289 }
1290
1291 bool
1292 fs_visitor::remove_dead_constants()
1293 {
1294    if (dispatch_width == 8) {
1295       this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1296
1297       for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1298          this->params_remap[i] = -1;
1299
1300       /* Find which params are still in use. */
1301       foreach_list(node, &this->instructions) {
1302          fs_inst *inst = (fs_inst *)node;
1303
1304          for (int i = 0; i < 3; i++) {
1305             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1306
1307             if (inst->src[i].file != UNIFORM)
1308                continue;
1309
1310             assert(constant_nr < (int)c->prog_data.nr_params);
1311
1312             /* For now, set this to non-negative.  We'll give it the
1313              * actual new number in a moment, in order to keep the
1314              * register numbers nicely ordered.
1315              */
1316             this->params_remap[constant_nr] = 0;
1317          }
1318       }
1319
1320       /* Figure out what the new numbers for the params will be.  At some
1321        * point when we're doing uniform array access, we're going to want
1322        * to keep the distinction between .reg and .reg_offset, but for
1323        * now we don't care.
1324        */
1325       unsigned int new_nr_params = 0;
1326       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1327          if (this->params_remap[i] != -1) {
1328             this->params_remap[i] = new_nr_params++;
1329          }
1330       }
1331
1332       /* Update the list of params to be uploaded to match our new numbering. */
1333       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1334          int remapped = this->params_remap[i];
1335
1336          if (remapped == -1)
1337             continue;
1338
1339          /* We've already done setup_paramvalues_refs() so no need to worry
1340           * about param_index and param_offset.
1341           */
1342          c->prog_data.param[remapped] = c->prog_data.param[i];
1343       }
1344
1345       c->prog_data.nr_params = new_nr_params;
1346    } else {
1347       /* This should have been generated in the 8-wide pass already. */
1348       assert(this->params_remap);
1349    }
1350
1351    /* Now do the renumbering of the shader to remove unused params. */
1352    foreach_list(node, &this->instructions) {
1353       fs_inst *inst = (fs_inst *)node;
1354
1355       for (int i = 0; i < 3; i++) {
1356          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1357
1358          if (inst->src[i].file != UNIFORM)
1359             continue;
1360
1361          assert(this->params_remap[constant_nr] != -1);
1362          inst->src[i].reg = this->params_remap[constant_nr];
1363          inst->src[i].reg_offset = 0;
1364       }
1365    }
1366
1367    return true;
1368 }
1369
1370 /**
1371  * Choose accesses from the UNIFORM file to demote to using the pull
1372  * constant buffer.
1373  *
1374  * We allow a fragment shader to have more than the specified minimum
1375  * maximum number of fragment shader uniform components (64).  If
1376  * there are too many of these, they'd fill up all of register space.
1377  * So, this will push some of them out to the pull constant buffer and
1378  * update the program to load them.
1379  */
1380 void
1381 fs_visitor::setup_pull_constants()
1382 {
1383    /* Only allow 16 registers (128 uniform components) as push constants. */
1384    unsigned int max_uniform_components = 16 * 8;
1385    if (c->prog_data.nr_params <= max_uniform_components)
1386       return;
1387
1388    if (dispatch_width == 16) {
1389       fail("Pull constants not supported in 16-wide\n");
1390       return;
1391    }
1392
1393    /* Just demote the end of the list.  We could probably do better
1394     * here, demoting things that are rarely used in the program first.
1395     */
1396    int pull_uniform_base = max_uniform_components;
1397    int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
1398
1399    foreach_list(node, &this->instructions) {
1400       fs_inst *inst = (fs_inst *)node;
1401
1402       for (int i = 0; i < 3; i++) {
1403          if (inst->src[i].file != UNIFORM)
1404             continue;
1405
1406          int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1407          if (uniform_nr < pull_uniform_base)
1408             continue;
1409
1410          fs_reg dst = fs_reg(this, glsl_type::float_type);
1411          fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1412          fs_reg offset = fs_reg((unsigned)(((uniform_nr -
1413                                              pull_uniform_base) * 4) & ~15));
1414          fs_inst *pull =
1415             new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1416                                  dst, index, offset);
1417          pull->ir = inst->ir;
1418          pull->annotation = inst->annotation;
1419          pull->base_mrf = 14;
1420          pull->mlen = 1;
1421
1422          inst->insert_before(pull);
1423
1424          inst->src[i].file = GRF;
1425          inst->src[i].reg = dst.reg;
1426          inst->src[i].reg_offset = 0;
1427          inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
1428       }
1429    }
1430
1431    for (int i = 0; i < pull_uniform_count; i++) {
1432       c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
1433    }
1434    c->prog_data.nr_params -= pull_uniform_count;
1435    c->prog_data.nr_pull_params = pull_uniform_count;
1436 }
1437
1438 bool
1439 fs_visitor::opt_algebraic()
1440 {
1441    bool progress = false;
1442
1443    foreach_list(node, &this->instructions) {
1444       fs_inst *inst = (fs_inst *)node;
1445
1446       switch (inst->opcode) {
1447       case BRW_OPCODE_MUL:
1448          if (inst->src[1].file != IMM)
1449             continue;
1450
1451          /* a * 1.0 = a */
1452          if (inst->src[1].is_one()) {
1453             inst->opcode = BRW_OPCODE_MOV;
1454             inst->src[1] = reg_undef;
1455             progress = true;
1456             break;
1457          }
1458
1459          /* a * 0.0 = 0.0 */
1460          if (inst->src[1].is_zero()) {
1461             inst->opcode = BRW_OPCODE_MOV;
1462             inst->src[0] = inst->src[1];
1463             inst->src[1] = reg_undef;
1464             progress = true;
1465             break;
1466          }
1467
1468          break;
1469       case BRW_OPCODE_ADD:
1470          if (inst->src[1].file != IMM)
1471             continue;
1472
1473          /* a + 0.0 = a */
1474          if (inst->src[1].is_zero()) {
1475             inst->opcode = BRW_OPCODE_MOV;
1476             inst->src[1] = reg_undef;
1477             progress = true;
1478             break;
1479          }
1480          break;
1481       default:
1482          break;
1483       }
1484    }
1485
1486    return progress;
1487 }
1488
1489 /**
1490  * Must be called after calculate_live_intervales() to remove unused
1491  * writes to registers -- register allocation will fail otherwise
1492  * because something deffed but not used won't be considered to
1493  * interfere with other regs.
1494  */
1495 bool
1496 fs_visitor::dead_code_eliminate()
1497 {
1498    bool progress = false;
1499    int pc = 0;
1500
1501    calculate_live_intervals();
1502
1503    foreach_list_safe(node, &this->instructions) {
1504       fs_inst *inst = (fs_inst *)node;
1505
1506       if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1507          inst->remove();
1508          progress = true;
1509       }
1510
1511       pc++;
1512    }
1513
1514    if (progress)
1515       live_intervals_valid = false;
1516
1517    return progress;
1518 }
1519
1520 /**
1521  * Implements a second type of register coalescing: This one checks if
1522  * the two regs involved in a raw move don't interfere, in which case
1523  * they can both by stored in the same place and the MOV removed.
1524  */
1525 bool
1526 fs_visitor::register_coalesce_2()
1527 {
1528    bool progress = false;
1529
1530    calculate_live_intervals();
1531
1532    foreach_list_safe(node, &this->instructions) {
1533       fs_inst *inst = (fs_inst *)node;
1534
1535       if (inst->opcode != BRW_OPCODE_MOV ||
1536           inst->predicate ||
1537           inst->saturate ||
1538           inst->src[0].file != GRF ||
1539           inst->src[0].negate ||
1540           inst->src[0].abs ||
1541           inst->src[0].smear != -1 ||
1542           inst->dst.file != GRF ||
1543           inst->dst.type != inst->src[0].type ||
1544           virtual_grf_sizes[inst->src[0].reg] != 1 ||
1545           virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1546          continue;
1547       }
1548
1549       int reg_from = inst->src[0].reg;
1550       assert(inst->src[0].reg_offset == 0);
1551       int reg_to = inst->dst.reg;
1552       int reg_to_offset = inst->dst.reg_offset;
1553
1554       foreach_list_safe(node, &this->instructions) {
1555          fs_inst *scan_inst = (fs_inst *)node;
1556
1557          if (scan_inst->dst.file == GRF &&
1558              scan_inst->dst.reg == reg_from) {
1559             scan_inst->dst.reg = reg_to;
1560             scan_inst->dst.reg_offset = reg_to_offset;
1561          }
1562          for (int i = 0; i < 3; i++) {
1563             if (scan_inst->src[i].file == GRF &&
1564                 scan_inst->src[i].reg == reg_from) {
1565                scan_inst->src[i].reg = reg_to;
1566                scan_inst->src[i].reg_offset = reg_to_offset;
1567             }
1568          }
1569       }
1570
1571       inst->remove();
1572       live_intervals_valid = false;
1573       progress = true;
1574       continue;
1575    }
1576
1577    return progress;
1578 }
1579
1580 bool
1581 fs_visitor::register_coalesce()
1582 {
1583    bool progress = false;
1584    int if_depth = 0;
1585    int loop_depth = 0;
1586
1587    foreach_list_safe(node, &this->instructions) {
1588       fs_inst *inst = (fs_inst *)node;
1589
1590       /* Make sure that we dominate the instructions we're going to
1591        * scan for interfering with our coalescing, or we won't have
1592        * scanned enough to see if anything interferes with our
1593        * coalescing.  We don't dominate the following instructions if
1594        * we're in a loop or an if block.
1595        */
1596       switch (inst->opcode) {
1597       case BRW_OPCODE_DO:
1598          loop_depth++;
1599          break;
1600       case BRW_OPCODE_WHILE:
1601          loop_depth--;
1602          break;
1603       case BRW_OPCODE_IF:
1604          if_depth++;
1605          break;
1606       case BRW_OPCODE_ENDIF:
1607          if_depth--;
1608          break;
1609       default:
1610          break;
1611       }
1612       if (loop_depth || if_depth)
1613          continue;
1614
1615       if (inst->opcode != BRW_OPCODE_MOV ||
1616           inst->predicate ||
1617           inst->saturate ||
1618           inst->dst.file != GRF || (inst->src[0].file != GRF &&
1619                                     inst->src[0].file != UNIFORM)||
1620           inst->dst.type != inst->src[0].type)
1621          continue;
1622
1623       bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
1624
1625       /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
1626        * them: check for no writes to either one until the exit of the
1627        * program.
1628        */
1629       bool interfered = false;
1630
1631       for (fs_inst *scan_inst = (fs_inst *)inst->next;
1632            !scan_inst->is_tail_sentinel();
1633            scan_inst = (fs_inst *)scan_inst->next) {
1634          if (scan_inst->dst.file == GRF) {
1635             if (scan_inst->overwrites_reg(inst->dst) ||
1636                 scan_inst->overwrites_reg(inst->src[0])) {
1637                interfered = true;
1638                break;
1639             }
1640          }
1641
1642          /* The gen6 MATH instruction can't handle source modifiers or
1643           * unusual register regions, so avoid coalescing those for
1644           * now.  We should do something more specific.
1645           */
1646          if (intel->gen >= 6 &&
1647              scan_inst->is_math() &&
1648              (has_source_modifiers || inst->src[0].file == UNIFORM)) {
1649             interfered = true;
1650             break;
1651          }
1652
1653          /* The accumulator result appears to get used for the
1654           * conditional modifier generation.  When negating a UD
1655           * value, there is a 33rd bit generated for the sign in the
1656           * accumulator value, so now you can't check, for example,
1657           * equality with a 32-bit value.  See piglit fs-op-neg-uint.
1658           */
1659          if (scan_inst->conditional_mod &&
1660              inst->src[0].negate &&
1661              inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1662             interfered = true;
1663             break;
1664          }
1665       }
1666       if (interfered) {
1667          continue;
1668       }
1669
1670       /* Rewrite the later usage to point at the source of the move to
1671        * be removed.
1672        */
1673       for (fs_inst *scan_inst = inst;
1674            !scan_inst->is_tail_sentinel();
1675            scan_inst = (fs_inst *)scan_inst->next) {
1676          for (int i = 0; i < 3; i++) {
1677             if (scan_inst->src[i].file == GRF &&
1678                 scan_inst->src[i].reg == inst->dst.reg &&
1679                 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1680                fs_reg new_src = inst->src[0];
1681                if (scan_inst->src[i].abs) {
1682                   new_src.negate = 0;
1683                   new_src.abs = 1;
1684                }
1685                new_src.negate ^= scan_inst->src[i].negate;
1686                scan_inst->src[i] = new_src;
1687             }
1688          }
1689       }
1690
1691       inst->remove();
1692       progress = true;
1693    }
1694
1695    if (progress)
1696       live_intervals_valid = false;
1697
1698    return progress;
1699 }
1700
1701
1702 bool
1703 fs_visitor::compute_to_mrf()
1704 {
1705    bool progress = false;
1706    int next_ip = 0;
1707
1708    calculate_live_intervals();
1709
1710    foreach_list_safe(node, &this->instructions) {
1711       fs_inst *inst = (fs_inst *)node;
1712
1713       int ip = next_ip;
1714       next_ip++;
1715
1716       if (inst->opcode != BRW_OPCODE_MOV ||
1717           inst->predicate ||
1718           inst->dst.file != MRF || inst->src[0].file != GRF ||
1719           inst->dst.type != inst->src[0].type ||
1720           inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1721          continue;
1722
1723       /* Work out which hardware MRF registers are written by this
1724        * instruction.
1725        */
1726       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1727       int mrf_high;
1728       if (inst->dst.reg & BRW_MRF_COMPR4) {
1729          mrf_high = mrf_low + 4;
1730       } else if (dispatch_width == 16 &&
1731                  (!inst->force_uncompressed && !inst->force_sechalf)) {
1732          mrf_high = mrf_low + 1;
1733       } else {
1734          mrf_high = mrf_low;
1735       }
1736
1737       /* Can't compute-to-MRF this GRF if someone else was going to
1738        * read it later.
1739        */
1740       if (this->virtual_grf_use[inst->src[0].reg] > ip)
1741          continue;
1742
1743       /* Found a move of a GRF to a MRF.  Let's see if we can go
1744        * rewrite the thing that made this GRF to write into the MRF.
1745        */
1746       fs_inst *scan_inst;
1747       for (scan_inst = (fs_inst *)inst->prev;
1748            scan_inst->prev != NULL;
1749            scan_inst = (fs_inst *)scan_inst->prev) {
1750          if (scan_inst->dst.file == GRF &&
1751              scan_inst->dst.reg == inst->src[0].reg) {
1752             /* Found the last thing to write our reg we want to turn
1753              * into a compute-to-MRF.
1754              */
1755
1756             /* SENDs can only write to GRFs, so no compute-to-MRF. */
1757             if (scan_inst->mlen) {
1758                break;
1759             }
1760
1761             /* If it's predicated, it (probably) didn't populate all
1762              * the channels.  We might be able to rewrite everything
1763              * that writes that reg, but it would require smarter
1764              * tracking to delay the rewriting until complete success.
1765              */
1766             if (scan_inst->predicate)
1767                break;
1768
1769             /* If it's half of register setup and not the same half as
1770              * our MOV we're trying to remove, bail for now.
1771              */
1772             if (scan_inst->force_uncompressed != inst->force_uncompressed ||
1773                 scan_inst->force_sechalf != inst->force_sechalf) {
1774                break;
1775             }
1776
1777             /* SEND instructions can't have MRF as a destination. */
1778             if (scan_inst->mlen)
1779                break;
1780
1781             if (intel->gen >= 6) {
1782                /* gen6 math instructions must have the destination be
1783                 * GRF, so no compute-to-MRF for them.
1784                 */
1785                if (scan_inst->is_math()) {
1786                   break;
1787                }
1788             }
1789
1790             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1791                /* Found the creator of our MRF's source value. */
1792                scan_inst->dst.file = MRF;
1793                scan_inst->dst.reg = inst->dst.reg;
1794                scan_inst->saturate |= inst->saturate;
1795                inst->remove();
1796                progress = true;
1797             }
1798             break;
1799          }
1800
1801          /* We don't handle flow control here.  Most computation of
1802           * values that end up in MRFs are shortly before the MRF
1803           * write anyway.
1804           */
1805          if (scan_inst->opcode == BRW_OPCODE_DO ||
1806              scan_inst->opcode == BRW_OPCODE_WHILE ||
1807              scan_inst->opcode == BRW_OPCODE_ELSE ||
1808              scan_inst->opcode == BRW_OPCODE_ENDIF) {
1809             break;
1810          }
1811
1812          /* You can't read from an MRF, so if someone else reads our
1813           * MRF's source GRF that we wanted to rewrite, that stops us.
1814           */
1815          bool interfered = false;
1816          for (int i = 0; i < 3; i++) {
1817             if (scan_inst->src[i].file == GRF &&
1818                 scan_inst->src[i].reg == inst->src[0].reg &&
1819                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
1820                interfered = true;
1821             }
1822          }
1823          if (interfered)
1824             break;
1825
1826          if (scan_inst->dst.file == MRF) {
1827             /* If somebody else writes our MRF here, we can't
1828              * compute-to-MRF before that.
1829              */
1830             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
1831             int scan_mrf_high;
1832
1833             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
1834                scan_mrf_high = scan_mrf_low + 4;
1835             } else if (dispatch_width == 16 &&
1836                        (!scan_inst->force_uncompressed &&
1837                         !scan_inst->force_sechalf)) {
1838                scan_mrf_high = scan_mrf_low + 1;
1839             } else {
1840                scan_mrf_high = scan_mrf_low;
1841             }
1842
1843             if (mrf_low == scan_mrf_low ||
1844                 mrf_low == scan_mrf_high ||
1845                 mrf_high == scan_mrf_low ||
1846                 mrf_high == scan_mrf_high) {
1847                break;
1848             }
1849          }
1850
1851          if (scan_inst->mlen > 0) {
1852             /* Found a SEND instruction, which means that there are
1853              * live values in MRFs from base_mrf to base_mrf +
1854              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
1855              * above it.
1856              */
1857             if (mrf_low >= scan_inst->base_mrf &&
1858                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
1859                break;
1860             }
1861             if (mrf_high >= scan_inst->base_mrf &&
1862                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
1863                break;
1864             }
1865          }
1866       }
1867    }
1868
1869    if (progress)
1870       live_intervals_valid = false;
1871
1872    return progress;
1873 }
1874
1875 /**
1876  * Walks through basic blocks, looking for repeated MRF writes and
1877  * removing the later ones.
1878  */
1879 bool
1880 fs_visitor::remove_duplicate_mrf_writes()
1881 {
1882    fs_inst *last_mrf_move[16];
1883    bool progress = false;
1884
1885    /* Need to update the MRF tracking for compressed instructions. */
1886    if (dispatch_width == 16)
1887       return false;
1888
1889    memset(last_mrf_move, 0, sizeof(last_mrf_move));
1890
1891    foreach_list_safe(node, &this->instructions) {
1892       fs_inst *inst = (fs_inst *)node;
1893
1894       switch (inst->opcode) {
1895       case BRW_OPCODE_DO:
1896       case BRW_OPCODE_WHILE:
1897       case BRW_OPCODE_IF:
1898       case BRW_OPCODE_ELSE:
1899       case BRW_OPCODE_ENDIF:
1900          memset(last_mrf_move, 0, sizeof(last_mrf_move));
1901          continue;
1902       default:
1903          break;
1904       }
1905
1906       if (inst->opcode == BRW_OPCODE_MOV &&
1907           inst->dst.file == MRF) {
1908          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
1909          if (prev_inst && inst->equals(prev_inst)) {
1910             inst->remove();
1911             progress = true;
1912             continue;
1913          }
1914       }
1915
1916       /* Clear out the last-write records for MRFs that were overwritten. */
1917       if (inst->dst.file == MRF) {
1918          last_mrf_move[inst->dst.reg] = NULL;
1919       }
1920
1921       if (inst->mlen > 0) {
1922          /* Found a SEND instruction, which will include two or fewer
1923           * implied MRF writes.  We could do better here.
1924           */
1925          for (int i = 0; i < implied_mrf_writes(inst); i++) {
1926             last_mrf_move[inst->base_mrf + i] = NULL;
1927          }
1928       }
1929
1930       /* Clear out any MRF move records whose sources got overwritten. */
1931       if (inst->dst.file == GRF) {
1932          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
1933             if (last_mrf_move[i] &&
1934                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
1935                last_mrf_move[i] = NULL;
1936             }
1937          }
1938       }
1939
1940       if (inst->opcode == BRW_OPCODE_MOV &&
1941           inst->dst.file == MRF &&
1942           inst->src[0].file == GRF &&
1943           !inst->predicate) {
1944          last_mrf_move[inst->dst.reg] = inst;
1945       }
1946    }
1947
1948    if (progress)
1949       live_intervals_valid = false;
1950
1951    return progress;
1952 }
1953
1954 void
1955 fs_visitor::dump_instruction(fs_inst *inst)
1956 {
1957    if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
1958        opcode_descs[inst->opcode].name) {
1959       printf("%s", opcode_descs[inst->opcode].name);
1960    } else {
1961       printf("op%d", inst->opcode);
1962    }
1963    if (inst->saturate)
1964       printf(".sat");
1965    printf(" ");
1966
1967    switch (inst->dst.file) {
1968    case GRF:
1969       printf("vgrf%d", inst->dst.reg);
1970       if (inst->dst.reg_offset)
1971          printf("+%d", inst->dst.reg_offset);
1972       break;
1973    case MRF:
1974       printf("m%d", inst->dst.reg);
1975       break;
1976    case BAD_FILE:
1977       printf("(null)");
1978       break;
1979    case UNIFORM:
1980       printf("***u%d***", inst->dst.reg);
1981       break;
1982    default:
1983       printf("???");
1984       break;
1985    }
1986    printf(", ");
1987
1988    for (int i = 0; i < 3; i++) {
1989       if (inst->src[i].negate)
1990          printf("-");
1991       if (inst->src[i].abs)
1992          printf("|");
1993       switch (inst->src[i].file) {
1994       case GRF:
1995          printf("vgrf%d", inst->src[i].reg);
1996          if (inst->src[i].reg_offset)
1997             printf("+%d", inst->src[i].reg_offset);
1998          break;
1999       case MRF:
2000          printf("***m%d***", inst->src[i].reg);
2001          break;
2002       case UNIFORM:
2003          printf("u%d", inst->src[i].reg);
2004          if (inst->src[i].reg_offset)
2005             printf(".%d", inst->src[i].reg_offset);
2006          break;
2007       case BAD_FILE:
2008          printf("(null)");
2009          break;
2010       default:
2011          printf("???");
2012          break;
2013       }
2014       if (inst->src[i].abs)
2015          printf("|");
2016
2017       if (i < 3)
2018          printf(", ");
2019    }
2020
2021    printf(" ");
2022
2023    if (inst->force_uncompressed)
2024       printf("1sthalf ");
2025
2026    if (inst->force_sechalf)
2027       printf("2ndhalf ");
2028
2029    printf("\n");
2030 }
2031
2032 void
2033 fs_visitor::dump_instructions()
2034 {
2035    int ip = 0;
2036    foreach_list(node, &this->instructions) {
2037       fs_inst *inst = (fs_inst *)node;
2038       printf("%d: ", ip++);
2039       dump_instruction(inst);
2040    }
2041 }
2042
2043 /**
2044  * Possibly returns an instruction that set up @param reg.
2045  *
2046  * Sometimes we want to take the result of some expression/variable
2047  * dereference tree and rewrite the instruction generating the result
2048  * of the tree.  When processing the tree, we know that the
2049  * instructions generated are all writing temporaries that are dead
2050  * outside of this tree.  So, if we have some instructions that write
2051  * a temporary, we're free to point that temp write somewhere else.
2052  *
2053  * Note that this doesn't guarantee that the instruction generated
2054  * only reg -- it might be the size=4 destination of a texture instruction.
2055  */
2056 fs_inst *
2057 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2058                                            fs_inst *end,
2059                                            fs_reg reg)
2060 {
2061    if (end == start ||
2062        end->predicate ||
2063        end->force_uncompressed ||
2064        end->force_sechalf ||
2065        !reg.equals(end->dst)) {
2066       return NULL;
2067    } else {
2068       return end;
2069    }
2070 }
2071
2072 void
2073 fs_visitor::setup_payload_gen6()
2074 {
2075    struct intel_context *intel = &brw->intel;
2076    bool uses_depth =
2077       (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2078    unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2079
2080    assert(intel->gen >= 6);
2081
2082    /* R0-1: masks, pixel X/Y coordinates. */
2083    c->nr_payload_regs = 2;
2084    /* R2: only for 32-pixel dispatch.*/
2085
2086    /* R3-26: barycentric interpolation coordinates.  These appear in the
2087     * same order that they appear in the brw_wm_barycentric_interp_mode
2088     * enum.  Each set of coordinates occupies 2 registers if dispatch width
2089     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
2090     * appear if they were enabled using the "Barycentric Interpolation
2091     * Mode" bits in WM_STATE.
2092     */
2093    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2094       if (barycentric_interp_modes & (1 << i)) {
2095          c->barycentric_coord_reg[i] = c->nr_payload_regs;
2096          c->nr_payload_regs += 2;
2097          if (dispatch_width == 16) {
2098             c->nr_payload_regs += 2;
2099          }
2100       }
2101    }
2102
2103    /* R27: interpolated depth if uses source depth */
2104    if (uses_depth) {
2105       c->source_depth_reg = c->nr_payload_regs;
2106       c->nr_payload_regs++;
2107       if (dispatch_width == 16) {
2108          /* R28: interpolated depth if not 8-wide. */
2109          c->nr_payload_regs++;
2110       }
2111    }
2112    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2113    if (uses_depth) {
2114       c->source_w_reg = c->nr_payload_regs;
2115       c->nr_payload_regs++;
2116       if (dispatch_width == 16) {
2117          /* R30: interpolated W if not 8-wide. */
2118          c->nr_payload_regs++;
2119       }
2120    }
2121    /* R31: MSAA position offsets. */
2122    /* R32-: bary for 32-pixel. */
2123    /* R58-59: interp W for 32-pixel. */
2124
2125    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2126       c->source_depth_to_render_target = true;
2127    }
2128 }
2129
2130 bool
2131 fs_visitor::run()
2132 {
2133    uint32_t orig_nr_params = c->prog_data.nr_params;
2134
2135    if (intel->gen >= 6)
2136       setup_payload_gen6();
2137    else
2138       setup_payload_gen4();
2139
2140    if (0) {
2141       emit_dummy_fs();
2142    } else {
2143       calculate_urb_setup();
2144       if (intel->gen < 6)
2145          emit_interpolation_setup_gen4();
2146       else
2147          emit_interpolation_setup_gen6();
2148
2149       /* Generate FS IR for main().  (the visitor only descends into
2150        * functions called "main").
2151        */
2152       if (shader) {
2153          foreach_list(node, &*shader->ir) {
2154             ir_instruction *ir = (ir_instruction *)node;
2155             base_ir = ir;
2156             this->result = reg_undef;
2157             ir->accept(this);
2158          }
2159       } else {
2160          emit_fragment_program_code();
2161       }
2162       base_ir = NULL;
2163       if (failed)
2164          return false;
2165
2166       emit_fb_writes();
2167
2168       split_virtual_grfs();
2169
2170       setup_paramvalues_refs();
2171       setup_pull_constants();
2172
2173       bool progress;
2174       do {
2175          progress = false;
2176
2177          compact_virtual_grfs();
2178
2179          progress = remove_duplicate_mrf_writes() || progress;
2180
2181          progress = opt_algebraic() || progress;
2182          progress = opt_cse() || progress;
2183          progress = opt_copy_propagate() || progress;
2184          progress = dead_code_eliminate() || progress;
2185          progress = register_coalesce() || progress;
2186          progress = register_coalesce_2() || progress;
2187          progress = compute_to_mrf() || progress;
2188       } while (progress);
2189
2190       remove_dead_constants();
2191
2192       schedule_instructions();
2193
2194       assign_curb_setup();
2195       assign_urb_setup();
2196
2197       if (0) {
2198          /* Debug of register spilling: Go spill everything. */
2199          for (int i = 0; i < virtual_grf_count; i++) {
2200             spill_reg(i);
2201          }
2202       }
2203
2204       if (0)
2205          assign_regs_trivial();
2206       else {
2207          while (!assign_regs()) {
2208             if (failed)
2209                break;
2210          }
2211       }
2212    }
2213    assert(force_uncompressed_stack == 0);
2214    assert(force_sechalf_stack == 0);
2215
2216    if (failed)
2217       return false;
2218
2219    if (dispatch_width == 8) {
2220       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2221    } else {
2222       c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2223
2224       /* Make sure we didn't try to sneak in an extra uniform */
2225       assert(orig_nr_params == c->prog_data.nr_params);
2226       (void) orig_nr_params;
2227    }
2228
2229    return !failed;
2230 }
2231
2232 const unsigned *
2233 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2234                struct gl_fragment_program *fp,
2235                struct gl_shader_program *prog,
2236                unsigned *final_assembly_size)
2237 {
2238    struct intel_context *intel = &brw->intel;
2239    bool start_busy = false;
2240    float start_time = 0;
2241
2242    if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2243       start_busy = (intel->batch.last_bo &&
2244                     drm_intel_bo_busy(intel->batch.last_bo));
2245       start_time = get_time();
2246    }
2247
2248    struct brw_shader *shader = NULL;
2249    if (prog)
2250       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2251
2252    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2253       if (shader) {
2254          printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2255          _mesa_print_ir(shader->ir, NULL);
2256          printf("\n\n");
2257       } else {
2258          printf("ARB_fragment_program %d ir for native fragment shader\n",
2259                 fp->Base.Id);
2260          _mesa_print_program(&fp->Base);
2261       }
2262    }
2263
2264    /* Now the main event: Visit the shader IR and generate our FS IR for it.
2265     */
2266    fs_visitor v(brw, c, prog, fp, 8);
2267    if (!v.run()) {
2268       prog->LinkStatus = false;
2269       ralloc_strcat(&prog->InfoLog, v.fail_msg);
2270
2271       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2272                     v.fail_msg);
2273
2274       return NULL;
2275    }
2276
2277    exec_list *simd16_instructions = NULL;
2278    fs_visitor v2(brw, c, prog, fp, 16);
2279    if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
2280       v2.import_uniforms(&v);
2281       if (!v2.run()) {
2282          perf_debug("16-wide shader failed to compile, falling back to "
2283                     "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2284       } else {
2285          simd16_instructions = &v2.instructions;
2286       }
2287    }
2288
2289    c->prog_data.dispatch_width = 8;
2290
2291    fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2292    const unsigned *generated = g.generate_assembly(&v.instructions,
2293                                                    simd16_instructions,
2294                                                    final_assembly_size);
2295
2296    if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2297       if (shader->compiled_once)
2298          brw_wm_debug_recompile(brw, prog, &c->key);
2299       shader->compiled_once = true;
2300
2301       if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2302          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2303                     (get_time() - start_time) * 1000);
2304       }
2305    }
2306
2307    return generated;
2308 }
2309
2310 bool
2311 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2312 {
2313    struct brw_context *brw = brw_context(ctx);
2314    struct intel_context *intel = &brw->intel;
2315    struct brw_wm_prog_key key;
2316
2317    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2318       return true;
2319
2320    struct gl_fragment_program *fp = (struct gl_fragment_program *)
2321       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2322    struct brw_fragment_program *bfp = brw_fragment_program(fp);
2323    bool program_uses_dfdy = fp->UsesDFdy;
2324
2325    memset(&key, 0, sizeof(key));
2326
2327    if (intel->gen < 6) {
2328       if (fp->UsesKill)
2329          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2330
2331       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2332          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2333
2334       /* Just assume depth testing. */
2335       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2336       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2337    }
2338
2339    if (prog->Name != 0)
2340       key.proj_attrib_mask = 0xffffffff;
2341
2342    if (intel->gen < 6)
2343       key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2344
2345    for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2346       if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2347          continue;
2348
2349       if (prog->Name == 0)
2350          key.proj_attrib_mask |= 1 << i;
2351
2352       if (intel->gen < 6) {
2353          int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2354
2355          if (vp_index >= 0)
2356             key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2357       }
2358    }
2359
2360    key.clamp_fragment_color = true;
2361
2362    for (int i = 0; i < MAX_SAMPLERS; i++) {
2363       if (fp->Base.ShadowSamplers & (1 << i)) {
2364          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2365          key.tex.swizzles[i] =
2366             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2367       } else {
2368          /* Color sampler: assume no swizzling. */
2369          key.tex.swizzles[i] = SWIZZLE_XYZW;
2370       }
2371    }
2372
2373    if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2374       key.drawable_height = ctx->DrawBuffer->Height;
2375    }
2376
2377    if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2378       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2379    }
2380
2381    key.nr_color_regions = 1;
2382
2383    key.program_string_id = bfp->id;
2384
2385    uint32_t old_prog_offset = brw->wm.prog_offset;
2386    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2387
2388    bool success = do_wm_prog(brw, prog, bfp, &key);
2389
2390    brw->wm.prog_offset = old_prog_offset;
2391    brw->wm.prog_data = old_prog_data;
2392
2393    return success;
2394 }