i965/fs: Move brw_wm_compile::fp to fs_visitor.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
void
fs_inst::init()
{
   /* Zero the whole instruction, then set the few fields whose default
    * is not zero.  All other members rely on the memset for their
    * initial state.
    */
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* Mark the destination and all three sources as unused. */
   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
/* Helpers that define fs_visitor emitter methods named after their BRW
 * opcode.  ALU1 expands to a one-source emitter, ALU2 to a two-source
 * emitter; each allocates the new fs_inst out of mem_ctx (the caller is
 * responsible for adding it to the instruction stream).
 */
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
164
165 /** Gen4 predicated IF. */
166 fs_inst *
167 fs_visitor::IF(uint32_t predicate)
168 {
169 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
170 inst->predicate = predicate;
171 return inst;
172 }
173
174 /** Gen6+ IF with embedded comparison. */
175 fs_inst *
176 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
177 {
178 assert(intel->gen >= 6);
179 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
180 reg_null_d, src0, src1);
181 inst->conditional_mod = condition;
182 return inst;
183 }
184
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   /* Rewrite negated unsigned sources into a form the comparison can
    * consume; see resolve_ud_negate().
    */
   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
221
222 bool
223 fs_inst::equals(fs_inst *inst)
224 {
225 return (opcode == inst->opcode &&
226 dst.equals(inst->dst) &&
227 src[0].equals(inst->src[0]) &&
228 src[1].equals(inst->src[1]) &&
229 src[2].equals(inst->src[2]) &&
230 saturate == inst->saturate &&
231 predicate == inst->predicate &&
232 conditional_mod == inst->conditional_mod &&
233 mlen == inst->mlen &&
234 base_mrf == inst->base_mrf &&
235 sampler == inst->sampler &&
236 target == inst->target &&
237 eot == inst->eot &&
238 header_present == inst->header_present &&
239 shadow_compare == inst->shadow_compare &&
240 offset == inst->offset);
241 }
242
243 int
244 fs_inst::regs_written()
245 {
246 if (is_tex())
247 return 4;
248
249 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
250 * but we don't currently use them...nor do we have an opcode for them.
251 */
252
253 return 1;
254 }
255
256 bool
257 fs_inst::overwrites_reg(const fs_reg &reg)
258 {
259 return (reg.file == dst.file &&
260 reg.reg == dst.reg &&
261 reg.reg_offset >= dst.reg_offset &&
262 reg.reg_offset < dst.reg_offset + regs_written());
263 }
264
265 bool
266 fs_inst::is_tex()
267 {
268 return (opcode == SHADER_OPCODE_TEX ||
269 opcode == FS_OPCODE_TXB ||
270 opcode == SHADER_OPCODE_TXD ||
271 opcode == SHADER_OPCODE_TXF ||
272 opcode == SHADER_OPCODE_TXL ||
273 opcode == SHADER_OPCODE_TXS);
274 }
275
276 bool
277 fs_inst::is_math()
278 {
279 return (opcode == SHADER_OPCODE_RCP ||
280 opcode == SHADER_OPCODE_RSQ ||
281 opcode == SHADER_OPCODE_SQRT ||
282 opcode == SHADER_OPCODE_EXP2 ||
283 opcode == SHADER_OPCODE_LOG2 ||
284 opcode == SHADER_OPCODE_SIN ||
285 opcode == SHADER_OPCODE_COS ||
286 opcode == SHADER_OPCODE_INT_QUOTIENT ||
287 opcode == SHADER_OPCODE_INT_REMAINDER ||
288 opcode == SHADER_OPCODE_POW);
289 }
290
void
fs_reg::init()
{
   /* Zero every field, then set the one member whose default isn't 0:
    * smear is -1 when no single channel has been selected.
    */
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}
297
/** Generic unset register constructor: the register refers to nothing. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}
304
/** Immediate value constructor: float, stored as BRW_REGISTER_TYPE_F. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor: signed int, stored as BRW_REGISTER_TYPE_D. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor: unsigned int, stored as BRW_REGISTER_TYPE_UD. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}
331
/** Fixed brw_reg Immediate value constructor.
 *
 * Wraps an already-assigned hardware register, inheriting its type.
 */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}
340
341 bool
342 fs_reg::equals(const fs_reg &r) const
343 {
344 return (file == r.file &&
345 reg == r.reg &&
346 reg_offset == r.reg_offset &&
347 type == r.type &&
348 negate == r.negate &&
349 abs == r.abs &&
350 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
351 sizeof(fixed_hw_reg)) == 0 &&
352 smear == r.smear &&
353 imm.u == r.imm.u);
354 }
355
356 int
357 fs_visitor::type_size(const struct glsl_type *type)
358 {
359 unsigned int size, i;
360
361 switch (type->base_type) {
362 case GLSL_TYPE_UINT:
363 case GLSL_TYPE_INT:
364 case GLSL_TYPE_FLOAT:
365 case GLSL_TYPE_BOOL:
366 return type->components();
367 case GLSL_TYPE_ARRAY:
368 return type_size(type->fields.array) * type->length;
369 case GLSL_TYPE_STRUCT:
370 size = 0;
371 for (i = 0; i < type->length; i++) {
372 size += type_size(type->fields.structure[i].type);
373 }
374 return size;
375 case GLSL_TYPE_SAMPLER:
376 /* Samplers take up no register space, since they're baked in at
377 * link time.
378 */
379 return 0;
380 default:
381 assert(!"not reached");
382 return 0;
383 }
384 }
385
/**
 * Mark the compile as failed, recording a printf-formatted reason.
 *
 * Only the first failure's message is kept in fail_msg; later calls
 * return immediately.  The message is also echoed to stderr when the
 * DEBUG_WM debug flag is set.
 */
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}
408
/* Convenience emitters: each constructs an fs_inst with the given
 * operands and appends it to the instruction stream via the emit(fs_inst)
 * overload, returning the emitted instruction.
 */
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}
439
/* Depth counters for nested force-uncompressed / force-sechalf regions.
 * Pushes and pops must balance; the asserts catch an unmatched pop.
 */
void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}
465
/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   /* No message payload means no MRFs touched. */
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* Unary math: one operand register, doubled for 16-wide dispatch. */
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* Binary math: two operand registers, doubled for 16-wide dispatch. */
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
510
511 int
512 fs_visitor::virtual_grf_alloc(int size)
513 {
514 if (virtual_grf_array_size <= virtual_grf_count) {
515 if (virtual_grf_array_size == 0)
516 virtual_grf_array_size = 16;
517 else
518 virtual_grf_array_size *= 2;
519 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
520 virtual_grf_array_size);
521 }
522 virtual_grf_sizes[virtual_grf_count] = size;
523 return virtual_grf_count++;
524 }
525
/** Fixed HW reg constructor; the type defaults to 32-bit float. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor with an explicit register type. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}
543
/** Automatic reg constructor.
 *
 * Allocates a fresh virtual GRF big enough to hold @type and derives
 * the register type from the GLSL base type.
 */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}
554
/** Returns the fs_reg assigned to @var, or NULL if it has no entry. */
fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}
560
561 void
562 import_uniforms_callback(const void *key,
563 void *data,
564 void *closure)
565 {
566 struct hash_table *dst_ht = (struct hash_table *)closure;
567 const fs_reg *reg = (const fs_reg *)data;
568
569 if (reg->file != UNIFORM)
570 return;
571
572 hash_table_insert(dst_ht, data, key);
573 }
574
/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   /* Copy every UNIFORM-file entry of v's variable hash table into ours
    * (see import_uniforms_callback) and share its params remap table.
    */
   hash_table_call_foreach(v->variable_ht,
			   import_uniforms_callback,
			   variable_ht);
   this->params_remap = v->params_remap;
}
586
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 *
 * Recursively walks @type, recording one (param_index, param_offset)
 * pair per scalar component starting at parameter location @loc.
 * Returns the number of parameter locations consumed.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   /* A matrix is laid out as matrix_columns consecutive column vectors. */
   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
							type->vector_elements,
							1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
	 offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      /* A scalar/vector occupies one location; each component becomes
       * one prog_data param slot pointing at (loc, component).
       */
      for (unsigned int i = 0; i < type->vector_elements; i++) {
	 unsigned int param = c->prog_data.nr_params++;

	 this->param_index[param] = loc;
	 this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset,
					type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
644
645
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					    (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
	 int swiz = GET_SWZ(slots[i].swizzle, j);
	 /* A channel repeating the previous one ends the scan — emitting
	  * it again would produce a redundant parameter.
	  */
	 if (swiz == last_swiz)
	    break;
	 last_swiz = swiz;

	 this->param_index[c->prog_data.nr_params] = index;
	 this->param_offset[c->prog_data.nr_params] = swiz;
	 c->prog_data.nr_params++;
      }
   }
}
680
/**
 * Sets up and returns the register holding gl_FragCoord.
 *
 * X and Y come from the computed pixel_x/pixel_y (offset by 0.5 unless
 * pixel_center_integer is set; Y is additionally flipped when rendering
 * to the window system rather than an FBO).  Z is the source depth — a
 * payload register on Gen6+, interpolated from WPOS setup data earlier.
 * W reuses wpos_w, which was computed during emit_interpolation.
 */
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      /* Flipping: y' = (height - 1) - y, folded into negate + offset. */
      if (flip) {
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
728
729 fs_inst *
730 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
731 glsl_interp_qualifier interpolation_mode,
732 bool is_centroid)
733 {
734 brw_wm_barycentric_interp_mode barycoord_mode;
735 if (is_centroid) {
736 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
737 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
738 else
739 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
740 } else {
741 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
742 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
743 else
744 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
745 }
746 return emit(FS_OPCODE_LINTERP, attr,
747 this->delta_x[barycoord_mode],
748 this->delta_y[barycoord_mode], interp);
749 }
750
/**
 * Emits interpolation code for a general fragment shader input.
 *
 * Walks each array element and matrix column of @ir, and for every live
 * URB slot emits either constant (flat-shaded) moves from the setup
 * data or per-component linear interpolation.  Returns the register
 * holding the interpolated value.
 */
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
	    /* Constant interpolation (flat shading) case. The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       interp = suboffset(interp, 3);
	       interp.type = reg->type;
	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Smooth/noperspective interpolation case. */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       /* FINISHME: At some point we probably want to push
		* this farther by giving similar treatment to the
		* other potentially constant components of the
		* attribute, as well as making brw_vs_constval.c
		* handle varyings other than gl_TexCoord.
		*/
	       if (location >= FRAG_ATTRIB_TEX0 &&
		   location <= FRAG_ATTRIB_TEX7 &&
		   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
		  /* Non-projected texcoords have a constant 1.0 in .w. */
		  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
	       } else {
		  struct brw_reg interp = interp_reg(location, k);
		  emit_linterp(attr, fs_reg(interp), interpolation_mode,
			       ir->centroid);
		  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
		     /* Get the pixel/sample mask into f0 so that we know
		      * which pixels are lit.  Then, for each channel that is
		      * unlit, replace the centroid data with non-centroid
		      * data.
		      */
		     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
		     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
						  interpolation_mode, false);
		     inst->predicate = BRW_PREDICATE_NORMAL;
		     inst->predicate_inverse = true;
		  }
		  if (intel->gen < 6) {
		     /* Multiply by pixel_w — presumably undoing the
		      * perspective divide done for the setup data;
		      * confirm against the interpolation setup code.
		      */
		     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
		  }
	       }
	       attr.reg_offset++;
	    }

	 }
	 location++;
      }
   }

   return reg;
}
842
/**
 * Sets up and returns a register holding gl_FrontFacing as 0/1.
 *
 * The front-facing information arrives as a bit in the thread payload:
 * Gen6+ derives it from g0 (arithmetic shift, invert, mask to one bit);
 * earlier parts compare bit 31 of g1.6, which is "primitive is back
 * face".
 */
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
	   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
	   fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
866
/**
 * Emits a one-source math instruction, handling generation quirks.
 *
 * Asserts the opcode is a supported unary math function.  On Gen6,
 * operands that the math unit can't consume directly (uniforms or
 * sources with negate/abs modifiers) are first copied to a temporary;
 * before Gen6, the operand is passed through an MRF message payload
 * (base_mrf/mlen set here).
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && (src.file == UNIFORM ||
			   src.abs ||
			   src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   /* Pre-Gen6 math needs an MRF message payload. */
   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}
909
/**
 * Emits a two-source math instruction (POW / INT_QUOTIENT /
 * INT_REMAINDER), lowering it appropriately per generation.
 *
 * Gen7+ can emit it directly; Gen6 first copies awkward operands
 * (uniforms or modified sources) to temporaries; earlier parts pass the
 * second operand through an MRF payload, swapping operand order for the
 * integer division functions per the Ironlake PRM.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 7) {
      inst = emit(opcode, dst, src0, src1);
   } else if (intel->gen == 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 expanded.type = src0.type;
	 emit(BRW_OPCODE_MOV, expanded, src0);
	 src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 expanded.type = src1.type;
	 emit(BRW_OPCODE_MOV, expanded, src1);
	 src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      /* The second operand travels in the message payload MRF. */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
971
/**
 * To be called after the last _mesa_add_state_reference() call, to
 * set up prog_data.param[] for assign_curb_setup() and
 * setup_pull_constants().
 */
void
fs_visitor::setup_paramvalues_refs()
{
   /* Only the 8-wide compile fills in param[]; the 16-wide compile
    * follows the 8-wide uniform setup (see import_uniforms).
    */
   if (dispatch_width != 8)
      return;

   /* Set up the pointers to ParamValues now that that array is finalized. */
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      c->prog_data.param[i] =
	 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
	 this->param_offset[i];
   }
}
990
/**
 * Assigns push-constant (CURB) locations.
 *
 * Records how many registers of constants the payload carries and which
 * GRF the constants start at for this dispatch width, then rewrites
 * every UNIFORM-file source to the fixed hardware register it lands in
 * (eight float constants per GRF).
 */
void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
	 if (inst->src[i].file == UNIFORM) {
	    int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
	    struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
						  constant_nr / 8,
						  constant_nr % 8);

	    inst->src[i].file = FIXED_HW_REG;
	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
	 }
      }
   }
}
1018
/**
 * Decides which URB setup slot each fragment shader input reads from,
 * filling in urb_setup[] (-1 means no incoming setup data).
 *
 * On Gen6+ slots simply follow InputsRead order; on older parts the
 * layout has to match what the SF produced from the VS outputs.
 */
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
	 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
	    urb_setup[i] = urb_next++;
	 }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
	 /* Point size is packed into the header, not as a general attribute */
	 if (i == VERT_RESULT_PSIZ)
	    continue;

	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
	    int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

	    /* The back color slot is skipped when the front color is
	     * also written to.  In addition, some slots can be
	     * written in the vertex shader and not read in the
	     * fragment shader.  So the register number must always be
	     * incremented, mapped or not.
	     */
	    if (fp_index >= 0)
	       urb_setup[fp_index] = urb_next;
	    urb_next++;
	 }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
	 urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}
1069
/**
 * Rebases interpolation setup register numbers now that the size of the
 * push-constant section is known.
 *
 * LINTERP's third source and CINTERP's first source hold per-attribute
 * setup registers relative to the start of the URB data; add urb_start
 * to make them real GRF numbers, and record the first GRF free for
 * register allocation.
 */
void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
	 assert(inst->src[2].file == FIXED_HW_REG);
	 inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
	 assert(inst->src[0].file == FIXED_HW_REG);
	 inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}
1094
/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
	 split_grf[i] = true;
      else
	 split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
	 split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
	 /* The original GRF keeps reg_offset 0; size-1 new single-reg
	  * GRFs cover offsets 1..size-1.  virtual_grf_alloc returns
	  * consecutive numbers, which the assert verifies.
	  */
	 new_virtual_grf[i] = virtual_grf_alloc(1);
	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
	    int reg = virtual_grf_alloc(1);
	    assert(reg == new_virtual_grf[i] + j - 1);
	    (void) reg;
	 }
	 this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Point each dst/src at the single-reg GRF now holding its
       * reg_offset; offset 0 stays in the original GRF.
       */
      if (inst->dst.file == GRF &&
	  split_grf[inst->dst.reg] &&
	  inst->dst.reg_offset != 0) {
	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
			  inst->dst.reg_offset - 1);
	 inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
	 if (inst->src[i].file == GRF &&
	     split_grf[inst->src[i].reg] &&
	     inst->src[i].reg_offset != 0) {
	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
				inst->src[i].reg_offset - 1);
	    inst->src[i].reg_offset = 0;
	 }
      }
   }
   /* Register numbering changed, so cached liveness data is stale. */
   this->live_intervals_valid = false;
}
1187
1188 /**
1189 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1190 *
1191 * During code generation, we create tons of temporary variables, many of
1192 * which get immediately killed and are never used again. Yet, in later
1193 * optimization and analysis passes, such as compute_live_intervals, we need
1194 * to loop over all the virtual GRFs. Compacting them can save a lot of
1195 * overhead.
1196 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   /* -1 is all-ones in every byte, so memset yields "unused" (-1) for each
    * entry; entries later set to 0 mean "used, new index not yet assigned".
    */
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   /* Guard the hand-unrolled lists above against the array sizes changing. */
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         /* new_index <= i always, so this in-place shift is safe. */
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}
1272
1273 bool
1274 fs_visitor::remove_dead_constants()
1275 {
1276 if (dispatch_width == 8) {
1277 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1278
1279 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1280 this->params_remap[i] = -1;
1281
1282 /* Find which params are still in use. */
1283 foreach_list(node, &this->instructions) {
1284 fs_inst *inst = (fs_inst *)node;
1285
1286 for (int i = 0; i < 3; i++) {
1287 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1288
1289 if (inst->src[i].file != UNIFORM)
1290 continue;
1291
1292 assert(constant_nr < (int)c->prog_data.nr_params);
1293
1294 /* For now, set this to non-negative. We'll give it the
1295 * actual new number in a moment, in order to keep the
1296 * register numbers nicely ordered.
1297 */
1298 this->params_remap[constant_nr] = 0;
1299 }
1300 }
1301
1302 /* Figure out what the new numbers for the params will be. At some
1303 * point when we're doing uniform array access, we're going to want
1304 * to keep the distinction between .reg and .reg_offset, but for
1305 * now we don't care.
1306 */
1307 unsigned int new_nr_params = 0;
1308 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1309 if (this->params_remap[i] != -1) {
1310 this->params_remap[i] = new_nr_params++;
1311 }
1312 }
1313
1314 /* Update the list of params to be uploaded to match our new numbering. */
1315 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1316 int remapped = this->params_remap[i];
1317
1318 if (remapped == -1)
1319 continue;
1320
1321 /* We've already done setup_paramvalues_refs() so no need to worry
1322 * about param_index and param_offset.
1323 */
1324 c->prog_data.param[remapped] = c->prog_data.param[i];
1325 }
1326
1327 c->prog_data.nr_params = new_nr_params;
1328 } else {
1329 /* This should have been generated in the 8-wide pass already. */
1330 assert(this->params_remap);
1331 }
1332
1333 /* Now do the renumbering of the shader to remove unused params. */
1334 foreach_list(node, &this->instructions) {
1335 fs_inst *inst = (fs_inst *)node;
1336
1337 for (int i = 0; i < 3; i++) {
1338 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1339
1340 if (inst->src[i].file != UNIFORM)
1341 continue;
1342
1343 assert(this->params_remap[constant_nr] != -1);
1344 inst->src[i].reg = this->params_remap[constant_nr];
1345 inst->src[i].reg_offset = 0;
1346 }
1347 }
1348
1349 return true;
1350 }
1351
1352 /**
1353 * Choose accesses from the UNIFORM file to demote to using the pull
1354 * constant buffer.
1355 *
1356 * We allow a fragment shader to have more than the specified minimum
1357 * maximum number of fragment shader uniform components (64). If
1358 * there are too many of these, they'd fill up all of register space.
1359 * So, this will push some of them out to the pull constant buffer and
1360 * update the program to load them.
1361 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   int pull_uniform_base = max_uniform_components;
   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
         if (uniform_nr < pull_uniform_base)
            continue;

         /* Emit a constant-buffer load of the 16-byte-aligned slot holding
          * this uniform, right before the instruction that reads it.
          */
         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(((uniform_nr -
                                             pull_uniform_base) * 4) & ~15));
         fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
                                              dst, index, offset);
         /* Keep debug annotations pointing at the original IR. */
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         /* NOTE(review): base_mrf 14 appears to be a scratch MRF reserved
          * for this message -- confirm against the generator's MRF usage.
          */
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         /* Rewrite the source to read the loaded value; smear selects
          * which of the four loaded components holds our uniform.
          */
         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
      }
   }

   /* Move the demoted params from the push list to the pull list. */
   for (int i = 0; i < pull_uniform_count; i++) {
      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
   }
   c->prog_data.nr_params -= pull_uniform_count;
   c->prog_data.nr_pull_params = pull_uniform_count;
}
1418
1419 bool
1420 fs_visitor::opt_algebraic()
1421 {
1422 bool progress = false;
1423
1424 foreach_list(node, &this->instructions) {
1425 fs_inst *inst = (fs_inst *)node;
1426
1427 switch (inst->opcode) {
1428 case BRW_OPCODE_MUL:
1429 if (inst->src[1].file != IMM)
1430 continue;
1431
1432 /* a * 1.0 = a */
1433 if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
1434 inst->src[1].imm.f == 1.0) {
1435 inst->opcode = BRW_OPCODE_MOV;
1436 inst->src[1] = reg_undef;
1437 progress = true;
1438 break;
1439 }
1440
1441 /* a * 0.0 = 0.0 */
1442 if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
1443 inst->src[1].imm.f == 0.0) {
1444 inst->opcode = BRW_OPCODE_MOV;
1445 inst->src[0] = fs_reg(0.0f);
1446 inst->src[1] = reg_undef;
1447 progress = true;
1448 break;
1449 }
1450
1451 break;
1452 case BRW_OPCODE_ADD:
1453 if (inst->src[1].file != IMM)
1454 continue;
1455
1456 /* a + 0.0 = a */
1457 if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
1458 inst->src[1].imm.f == 0.0) {
1459 inst->opcode = BRW_OPCODE_MOV;
1460 inst->src[1] = reg_undef;
1461 progress = true;
1462 break;
1463 }
1464 break;
1465 default:
1466 break;
1467 }
1468 }
1469
1470 return progress;
1471 }
1472
1473 /**
 * Must be called after calculate_live_intervals() to remove unused
1475 * writes to registers -- register allocation will fail otherwise
1476 * because something deffed but not used won't be considered to
1477 * interfere with other regs.
1478 */
1479 bool
1480 fs_visitor::dead_code_eliminate()
1481 {
1482 bool progress = false;
1483 int pc = 0;
1484
1485 calculate_live_intervals();
1486
1487 foreach_list_safe(node, &this->instructions) {
1488 fs_inst *inst = (fs_inst *)node;
1489
1490 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1491 inst->remove();
1492 progress = true;
1493 }
1494
1495 pc++;
1496 }
1497
1498 if (progress)
1499 live_intervals_valid = false;
1500
1501 return progress;
1502 }
1503
1504 /**
1505 * Implements a second type of register coalescing: This one checks if
1506 * the two regs involved in a raw move don't interfere, in which case
1507 * they can both by stored in the same place and the MOV removed.
1508 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Only raw, unmodified, size-1 GRF-to-GRF MOVs whose source and
       * destination live ranges don't overlap are candidates.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      /* Rewrite every def and use of reg_from (across the whole program)
       * to reg_to.  Note: this inner loop's `node` shadows the outer one.
       */
      foreach_list_safe(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      /* The MOV itself became reg_to = reg_to; drop it. */
      inst->remove();
      live_intervals_valid = false;
      progress = true;
      continue;
   }

   return progress;
}
1563
/**
 * Forward-propagates raw MOVs: uses of the MOV's destination are rewritten
 * to read its source, and the MOV is removed, when no later instruction
 * overwrites either register.  Only runs at the top control-flow level.
 */
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      /* Candidate: unpredicated, unsaturated MOV from a GRF or UNIFORM
       * into a GRF, with matching types.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM)||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now.  We should do something more specific.
          */
         if (intel->gen >= 6 &&
             scan_inst->is_math() &&
             (has_source_modifiers || inst->src[0].file == UNIFORM)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               /* abs at the use discards any negate from the MOV source. */
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               /* Compose the use's negate with the MOV source's negate. */
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
1684
1685
1686 bool
1687 fs_visitor::compute_to_mrf()
1688 {
1689 bool progress = false;
1690 int next_ip = 0;
1691
1692 calculate_live_intervals();
1693
1694 foreach_list_safe(node, &this->instructions) {
1695 fs_inst *inst = (fs_inst *)node;
1696
1697 int ip = next_ip;
1698 next_ip++;
1699
1700 if (inst->opcode != BRW_OPCODE_MOV ||
1701 inst->predicate ||
1702 inst->dst.file != MRF || inst->src[0].file != GRF ||
1703 inst->dst.type != inst->src[0].type ||
1704 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1705 continue;
1706
1707 /* Work out which hardware MRF registers are written by this
1708 * instruction.
1709 */
1710 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1711 int mrf_high;
1712 if (inst->dst.reg & BRW_MRF_COMPR4) {
1713 mrf_high = mrf_low + 4;
1714 } else if (dispatch_width == 16 &&
1715 (!inst->force_uncompressed && !inst->force_sechalf)) {
1716 mrf_high = mrf_low + 1;
1717 } else {
1718 mrf_high = mrf_low;
1719 }
1720
1721 /* Can't compute-to-MRF this GRF if someone else was going to
1722 * read it later.
1723 */
1724 if (this->virtual_grf_use[inst->src[0].reg] > ip)
1725 continue;
1726
1727 /* Found a move of a GRF to a MRF. Let's see if we can go
1728 * rewrite the thing that made this GRF to write into the MRF.
1729 */
1730 fs_inst *scan_inst;
1731 for (scan_inst = (fs_inst *)inst->prev;
1732 scan_inst->prev != NULL;
1733 scan_inst = (fs_inst *)scan_inst->prev) {
1734 if (scan_inst->dst.file == GRF &&
1735 scan_inst->dst.reg == inst->src[0].reg) {
1736 /* Found the last thing to write our reg we want to turn
1737 * into a compute-to-MRF.
1738 */
1739
1740 /* SENDs can only write to GRFs, so no compute-to-MRF. */
1741 if (scan_inst->mlen) {
1742 break;
1743 }
1744
1745 /* If it's predicated, it (probably) didn't populate all
1746 * the channels. We might be able to rewrite everything
1747 * that writes that reg, but it would require smarter
1748 * tracking to delay the rewriting until complete success.
1749 */
1750 if (scan_inst->predicate)
1751 break;
1752
1753 /* If it's half of register setup and not the same half as
1754 * our MOV we're trying to remove, bail for now.
1755 */
1756 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
1757 scan_inst->force_sechalf != inst->force_sechalf) {
1758 break;
1759 }
1760
1761 /* SEND instructions can't have MRF as a destination. */
1762 if (scan_inst->mlen)
1763 break;
1764
1765 if (intel->gen >= 6) {
1766 /* gen6 math instructions must have the destination be
1767 * GRF, so no compute-to-MRF for them.
1768 */
1769 if (scan_inst->is_math()) {
1770 break;
1771 }
1772 }
1773
1774 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1775 /* Found the creator of our MRF's source value. */
1776 scan_inst->dst.file = MRF;
1777 scan_inst->dst.reg = inst->dst.reg;
1778 scan_inst->saturate |= inst->saturate;
1779 inst->remove();
1780 progress = true;
1781 }
1782 break;
1783 }
1784
1785 /* We don't handle flow control here. Most computation of
1786 * values that end up in MRFs are shortly before the MRF
1787 * write anyway.
1788 */
1789 if (scan_inst->opcode == BRW_OPCODE_DO ||
1790 scan_inst->opcode == BRW_OPCODE_WHILE ||
1791 scan_inst->opcode == BRW_OPCODE_ELSE ||
1792 scan_inst->opcode == BRW_OPCODE_ENDIF) {
1793 break;
1794 }
1795
1796 /* You can't read from an MRF, so if someone else reads our
1797 * MRF's source GRF that we wanted to rewrite, that stops us.
1798 */
1799 bool interfered = false;
1800 for (int i = 0; i < 3; i++) {
1801 if (scan_inst->src[i].file == GRF &&
1802 scan_inst->src[i].reg == inst->src[0].reg &&
1803 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
1804 interfered = true;
1805 }
1806 }
1807 if (interfered)
1808 break;
1809
1810 if (scan_inst->dst.file == MRF) {
1811 /* If somebody else writes our MRF here, we can't
1812 * compute-to-MRF before that.
1813 */
1814 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
1815 int scan_mrf_high;
1816
1817 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
1818 scan_mrf_high = scan_mrf_low + 4;
1819 } else if (dispatch_width == 16 &&
1820 (!scan_inst->force_uncompressed &&
1821 !scan_inst->force_sechalf)) {
1822 scan_mrf_high = scan_mrf_low + 1;
1823 } else {
1824 scan_mrf_high = scan_mrf_low;
1825 }
1826
1827 if (mrf_low == scan_mrf_low ||
1828 mrf_low == scan_mrf_high ||
1829 mrf_high == scan_mrf_low ||
1830 mrf_high == scan_mrf_high) {
1831 break;
1832 }
1833 }
1834
1835 if (scan_inst->mlen > 0) {
1836 /* Found a SEND instruction, which means that there are
1837 * live values in MRFs from base_mrf to base_mrf +
1838 * scan_inst->mlen - 1. Don't go pushing our MRF write up
1839 * above it.
1840 */
1841 if (mrf_low >= scan_inst->base_mrf &&
1842 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
1843 break;
1844 }
1845 if (mrf_high >= scan_inst->base_mrf &&
1846 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
1847 break;
1848 }
1849 }
1850 }
1851 }
1852
1853 if (progress)
1854 live_intervals_valid = false;
1855
1856 return progress;
1857 }
1858
1859 /**
1860 * Walks through basic blocks, looking for repeated MRF writes and
1861 * removing the later ones.
1862 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   /* Tracks, per MRF number, the last simple MOV that wrote it (NULL if
    * unknown or invalidated).
    */
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_DO:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
         /* Basic block boundary: forget everything we tracked. */
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
         continue;
      default:
         break;
      }

      /* If this MOV writes the exact same value the MRF already holds,
       * drop it.
       */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      /* Record this MOV as the current contents of its MRF. */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->predicate) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
1937
1938 /**
1939 * Possibly returns an instruction that set up @param reg.
1940 *
1941 * Sometimes we want to take the result of some expression/variable
1942 * dereference tree and rewrite the instruction generating the result
1943 * of the tree. When processing the tree, we know that the
1944 * instructions generated are all writing temporaries that are dead
1945 * outside of this tree. So, if we have some instructions that write
1946 * a temporary, we're free to point that temp write somewhere else.
1947 *
1948 * Note that this doesn't guarantee that the instruction generated
1949 * only reg -- it might be the size=4 destination of a texture instruction.
1950 */
1951 fs_inst *
1952 fs_visitor::get_instruction_generating_reg(fs_inst *start,
1953 fs_inst *end,
1954 fs_reg reg)
1955 {
1956 if (end == start ||
1957 end->predicate ||
1958 end->force_uncompressed ||
1959 end->force_sechalf ||
1960 !reg.equals(end->dst)) {
1961 return NULL;
1962 } else {
1963 return end;
1964 }
1965 }
1966
/* Lays out the gen6+ thread payload registers (c->nr_payload_regs and the
 * per-mode barycentric / depth / W register indices) from the interpolation
 * modes and the program's inputs/outputs.
 */
void
fs_visitor::setup_payload_gen6()
{
   struct intel_context *intel = &brw->intel;
   /* NOTE(review): WPOS being read gates both the source-depth and the
    * source-W payload registers below -- the same flag is used for both.
    */
   bool uses_depth =
      (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(intel->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch.*/

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R31: MSAA position offsets. */
   /* R32-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}
2024
/**
 * Drives one compile of the fragment shader at this->dispatch_width:
 * payload setup, IR visit, optimization loop, register allocation and
 * native code generation.  Returns false if the compile failed.
 */
bool
fs_visitor::run()
{
   uint32_t prog_offset_16 = 0;
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (intel->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (dispatch_width == 16) {
      /* We have to do a compaction pass now, or the one at the end of
       * execution will squash down where our prog_offset start needs
       * to be.
       */
      brw_compact_instructions(p);

      /* align to 64 byte boundary. */
      while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
         brw_NOP(p);
      }

      /* Save off the start of this 16-wide program in case we succeed. */
      prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);

      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }

   if (0) {
      /* Debug path: emit a trivial shader instead of compiling. */
      emit_dummy_fs();
   } else {
      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      if (shader) {
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         /* No GLSL shader: translate the ARB fragment program instead. */
         emit_fragment_program_code();
      }
      if (failed)
         return false;

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      setup_pull_constants();

      /* Run the optimization passes to a fixed point. */
      bool progress;
      do {
         progress = false;

         compact_virtual_grfs();

         progress = remove_duplicate_mrf_writes() || progress;

         progress = opt_algebraic() || progress;
         progress = opt_cse() || progress;
         progress = opt_copy_propagate() || progress;
         progress = dead_code_eliminate() || progress;
         progress = register_coalesce() || progress;
         progress = register_coalesce_2() || progress;
         progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      schedule_instructions();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: Go spill everything. */
         for (int i = 0; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0)
         assign_regs_trivial();
      else {
         /* Keep retrying allocation; assign_regs() spills on failure and
          * only fail()s when it can make no further progress.
          */
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   generate_code();

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
      c->prog_data.prog_offset_16 = prog_offset_16;

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   return !failed;
}
2147
/**
 * Compiles the fragment shader: always builds the 8-wide program, and
 * additionally attempts a 16-wide program on gen5+ when no pull constants
 * are needed.  Returns false (and sets prog->InfoLog) on failure.
 */
bool
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog)
{
   struct intel_context *intel = &brw->intel;
   bool start_busy = false;
   float start_time = 0;

   /* Snapshot GPU-busy state and time so we can report compile stalls. */
   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
      start_busy = (intel->batch.last_bo &&
                    drm_intel_bo_busy(intel->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (shader) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(c, prog, fp, 8);
   if (!v.run()) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, v.fail_msg);

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return false;
   }

   /* A failed 16-wide compile is not fatal; we fall back to 8-wide. */
   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
      fs_visitor v2(c, prog, fp, 16);
      v2.import_uniforms(&v);
      if (!v2.run()) {
         perf_debug("16-wide shader failed to compile, falling back to "
                    "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
      }
   }

   c->prog_data.dispatch_width = 8;

   if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return true;
}
2216
/**
 * Compiles the fragment program at link time with a guessed default key,
 * so a likely-matching compiled program is already cached at draw time.
 * The previous wm program state is saved and restored around the compile.
 */
bool
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;
   struct brw_wm_prog_key key;

   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
      return true;

   struct gl_fragment_program *fp = (struct gl_fragment_program *)
      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

   memset(&key, 0, sizeof(key));

   if (intel->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   /* Name != 0 means a GLSL shader; ARB programs (Name == 0) set the mask
    * per-input in the loop below instead.
    */
   if (prog->Name != 0)
      key.proj_attrib_mask = 0xffffffff;

   if (intel->gen < 6)
      key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);

   for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
         continue;

      if (prog->Name == 0)
         key.proj_attrib_mask |= 1 << i;

      if (intel->gen < 6) {
         int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

         if (vp_index >= 0)
            key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
      }
   }

   key.clamp_fragment_color = true;

   for (int i = 0; i < MAX_SAMPLERS; i++) {
      if (fp->Base.ShadowSamplers & (1 << i)) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         key.tex.swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         key.tex.swizzles[i] = SWIZZLE_XYZW;
      }
   }

   if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   }

   key.nr_color_regions = 1;

   key.program_string_id = bfp->id;

   /* Compile with the guessed key, then restore the previously-bound
    * program so precompiling doesn't disturb current state.
    */
   uint32_t old_prog_offset = brw->wm.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = do_wm_prog(brw, prog, bfp, &key);

   brw->wm.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}