2 * Copyright © 2010 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
33 #include <sys/types.h>
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "program/prog_parameter.h"
39 #include "program/prog_print.h"
40 #include "program/register_allocate.h"
41 #include "program/sampler.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
47 #include "brw_shader.h"
49 #include "../glsl/glsl_types.h"
50 #include "../glsl/ir_print_visitor.h"
52 #define MAX_INSTRUCTION (1 << 30)
55 fs_visitor::type_size(const struct glsl_type
*type
)
59 switch (type
->base_type
) {
64 return type
->components();
66 return type_size(type
->fields
.array
) * type
->length
;
67 case GLSL_TYPE_STRUCT
:
69 for (i
= 0; i
< type
->length
; i
++) {
70 size
+= type_size(type
->fields
.structure
[i
].type
);
73 case GLSL_TYPE_SAMPLER
:
74 /* Samplers take up no register space, since they're baked in at
79 assert(!"not reached");
85 fs_visitor::fail(const char *format
, ...)
90 if (INTEL_DEBUG
& DEBUG_WM
) {
91 fprintf(stderr
, "FS compile failed: ");
95 vfprintf(stderr
, format
, va
);
102 fs_visitor::push_force_uncompressed()
104 force_uncompressed_stack
++;
108 fs_visitor::pop_force_uncompressed()
110 force_uncompressed_stack
--;
111 assert(force_uncompressed_stack
>= 0);
115 fs_visitor::push_force_sechalf()
117 force_sechalf_stack
++;
121 fs_visitor::pop_force_sechalf()
123 force_sechalf_stack
--;
124 assert(force_sechalf_stack
>= 0);
128 * Returns how many MRFs an FS opcode will write over.
130 * Note that this is not the 0 or 1 implied writes in an actual gen
131 * instruction -- the FS opcodes often generate MOVs in addition.
134 fs_visitor::implied_mrf_writes(fs_inst
*inst
)
139 switch (inst
->opcode
) {
147 return 1 * c
->dispatch_width
/ 8;
149 return 2 * c
->dispatch_width
/ 8;
155 case FS_OPCODE_FB_WRITE
:
157 case FS_OPCODE_PULL_CONSTANT_LOAD
:
158 case FS_OPCODE_UNSPILL
:
160 case FS_OPCODE_SPILL
:
163 assert(!"not reached");
169 fs_visitor::virtual_grf_alloc(int size
)
171 if (virtual_grf_array_size
<= virtual_grf_next
) {
172 if (virtual_grf_array_size
== 0)
173 virtual_grf_array_size
= 16;
175 virtual_grf_array_size
*= 2;
176 virtual_grf_sizes
= reralloc(mem_ctx
, virtual_grf_sizes
, int,
177 virtual_grf_array_size
);
179 /* This slot is always unused. */
180 virtual_grf_sizes
[0] = 0;
182 virtual_grf_sizes
[virtual_grf_next
] = size
;
183 return virtual_grf_next
++;
186 /** Fixed HW reg constructor. */
187 fs_reg::fs_reg(enum register_file file
, int hw_reg
)
191 this->hw_reg
= hw_reg
;
192 this->type
= BRW_REGISTER_TYPE_F
;
195 /** Fixed HW reg constructor. */
196 fs_reg::fs_reg(enum register_file file
, int hw_reg
, uint32_t type
)
200 this->hw_reg
= hw_reg
;
204 /** Automatic reg constructor. */
205 fs_reg::fs_reg(class fs_visitor
*v
, const struct glsl_type
*type
)
210 this->reg
= v
->virtual_grf_alloc(v
->type_size(type
));
211 this->reg_offset
= 0;
212 this->type
= brw_type_for_base_type(type
);
216 fs_visitor::variable_storage(ir_variable
*var
)
218 return (fs_reg
*)hash_table_find(this->variable_ht
, var
);
222 import_uniforms_callback(const void *key
,
226 struct hash_table
*dst_ht
= (struct hash_table
*)closure
;
227 const fs_reg
*reg
= (const fs_reg
*)data
;
229 if (reg
->file
!= UNIFORM
)
232 hash_table_insert(dst_ht
, data
, key
);
235 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
236 * This brings in those uniform definitions
239 fs_visitor::import_uniforms(struct hash_table
*src_variable_ht
)
241 hash_table_call_foreach(src_variable_ht
,
242 import_uniforms_callback
,
246 /* Our support for uniforms is piggy-backed on the struct
247 * gl_fragment_program, because that's where the values actually
248 * get stored, rather than in some global gl_shader_program uniform
252 fs_visitor::setup_uniform_values(int loc
, const glsl_type
*type
)
254 unsigned int offset
= 0;
256 if (type
->is_matrix()) {
257 const glsl_type
*column
= glsl_type::get_instance(GLSL_TYPE_FLOAT
,
258 type
->vector_elements
,
261 for (unsigned int i
= 0; i
< type
->matrix_columns
; i
++) {
262 offset
+= setup_uniform_values(loc
+ offset
, column
);
268 switch (type
->base_type
) {
269 case GLSL_TYPE_FLOAT
:
273 for (unsigned int i
= 0; i
< type
->vector_elements
; i
++) {
274 unsigned int param
= c
->prog_data
.nr_params
++;
276 assert(param
< ARRAY_SIZE(c
->prog_data
.param
));
278 switch (type
->base_type
) {
279 case GLSL_TYPE_FLOAT
:
280 c
->prog_data
.param_convert
[param
] = PARAM_NO_CONVERT
;
283 c
->prog_data
.param_convert
[param
] = PARAM_CONVERT_F2U
;
286 c
->prog_data
.param_convert
[param
] = PARAM_CONVERT_F2I
;
289 c
->prog_data
.param_convert
[param
] = PARAM_CONVERT_F2B
;
292 assert(!"not reached");
293 c
->prog_data
.param_convert
[param
] = PARAM_NO_CONVERT
;
296 this->param_index
[param
] = loc
;
297 this->param_offset
[param
] = i
;
301 case GLSL_TYPE_STRUCT
:
302 for (unsigned int i
= 0; i
< type
->length
; i
++) {
303 offset
+= setup_uniform_values(loc
+ offset
,
304 type
->fields
.structure
[i
].type
);
308 case GLSL_TYPE_ARRAY
:
309 for (unsigned int i
= 0; i
< type
->length
; i
++) {
310 offset
+= setup_uniform_values(loc
+ offset
, type
->fields
.array
);
314 case GLSL_TYPE_SAMPLER
:
315 /* The sampler takes up a slot, but we don't use any values from it. */
319 assert(!"not reached");
325 /* Our support for builtin uniforms is even scarier than non-builtin.
326 * It sits on top of the PROG_STATE_VAR parameters that are
327 * automatically updated from GL context state.
330 fs_visitor::setup_builtin_uniform_values(ir_variable
*ir
)
332 const ir_state_slot
*const slots
= ir
->state_slots
;
333 assert(ir
->state_slots
!= NULL
);
335 for (unsigned int i
= 0; i
< ir
->num_state_slots
; i
++) {
336 /* This state reference has already been setup by ir_to_mesa, but we'll
337 * get the same index back here.
339 int index
= _mesa_add_state_reference(this->fp
->Base
.Parameters
,
340 (gl_state_index
*)slots
[i
].tokens
);
342 /* Add each of the unique swizzles of the element as a parameter.
343 * This'll end up matching the expected layout of the
344 * array/matrix/structure we're trying to fill in.
347 for (unsigned int j
= 0; j
< 4; j
++) {
348 int swiz
= GET_SWZ(slots
[i
].swizzle
, j
);
349 if (swiz
== last_swiz
)
353 c
->prog_data
.param_convert
[c
->prog_data
.nr_params
] =
355 this->param_index
[c
->prog_data
.nr_params
] = index
;
356 this->param_offset
[c
->prog_data
.nr_params
] = swiz
;
357 c
->prog_data
.nr_params
++;
363 fs_visitor::emit_fragcoord_interpolation(ir_variable
*ir
)
365 fs_reg
*reg
= new(this->mem_ctx
) fs_reg(this, ir
->type
);
367 bool flip
= !ir
->origin_upper_left
^ c
->key
.render_to_fbo
;
370 if (ir
->pixel_center_integer
) {
371 emit(BRW_OPCODE_MOV
, wpos
, this->pixel_x
);
373 emit(BRW_OPCODE_ADD
, wpos
, this->pixel_x
, fs_reg(0.5f
));
378 if (!flip
&& ir
->pixel_center_integer
) {
379 emit(BRW_OPCODE_MOV
, wpos
, this->pixel_y
);
381 fs_reg pixel_y
= this->pixel_y
;
382 float offset
= (ir
->pixel_center_integer
? 0.0 : 0.5);
385 pixel_y
.negate
= true;
386 offset
+= c
->key
.drawable_height
- 1.0;
389 emit(BRW_OPCODE_ADD
, wpos
, pixel_y
, fs_reg(offset
));
394 if (intel
->gen
>= 6) {
395 emit(BRW_OPCODE_MOV
, wpos
,
396 fs_reg(brw_vec8_grf(c
->source_depth_reg
, 0)));
398 emit(FS_OPCODE_LINTERP
, wpos
, this->delta_x
, this->delta_y
,
399 interp_reg(FRAG_ATTRIB_WPOS
, 2));
403 /* gl_FragCoord.w: Already set up in emit_interpolation */
404 emit(BRW_OPCODE_MOV
, wpos
, this->wpos_w
);
410 fs_visitor::emit_general_interpolation(ir_variable
*ir
)
412 fs_reg
*reg
= new(this->mem_ctx
) fs_reg(this, ir
->type
);
413 /* Interpolation is always in floating point regs. */
414 reg
->type
= BRW_REGISTER_TYPE_F
;
417 unsigned int array_elements
;
418 const glsl_type
*type
;
420 if (ir
->type
->is_array()) {
421 array_elements
= ir
->type
->length
;
422 if (array_elements
== 0) {
423 fail("dereferenced array '%s' has length 0\n", ir
->name
);
425 type
= ir
->type
->fields
.array
;
431 int location
= ir
->location
;
432 for (unsigned int i
= 0; i
< array_elements
; i
++) {
433 for (unsigned int j
= 0; j
< type
->matrix_columns
; j
++) {
434 if (urb_setup
[location
] == -1) {
435 /* If there's no incoming setup data for this slot, don't
436 * emit interpolation for it.
438 attr
.reg_offset
+= type
->vector_elements
;
444 location
== FRAG_ATTRIB_COL0
|| location
== FRAG_ATTRIB_COL1
;
446 if (c
->key
.flat_shade
&& is_gl_Color
) {
447 /* Constant interpolation (flat shading) case. The SF has
448 * handed us defined values in only the constant offset
449 * field of the setup reg.
451 for (unsigned int k
= 0; k
< type
->vector_elements
; k
++) {
452 struct brw_reg interp
= interp_reg(location
, k
);
453 interp
= suboffset(interp
, 3);
454 emit(FS_OPCODE_CINTERP
, attr
, fs_reg(interp
));
458 /* Perspective interpolation case. */
459 for (unsigned int k
= 0; k
< type
->vector_elements
; k
++) {
460 struct brw_reg interp
= interp_reg(location
, k
);
461 emit(FS_OPCODE_LINTERP
, attr
,
462 this->delta_x
, this->delta_y
, fs_reg(interp
));
466 if (intel
->gen
< 6) {
467 attr
.reg_offset
-= type
->vector_elements
;
468 for (unsigned int k
= 0; k
< type
->vector_elements
; k
++) {
469 emit(BRW_OPCODE_MUL
, attr
, attr
, this->pixel_w
);
482 fs_visitor::emit_frontfacing_interpolation(ir_variable
*ir
)
484 fs_reg
*reg
= new(this->mem_ctx
) fs_reg(this, ir
->type
);
486 /* The frontfacing comes in as a bit in the thread payload. */
487 if (intel
->gen
>= 6) {
488 emit(BRW_OPCODE_ASR
, *reg
,
489 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D
)),
491 emit(BRW_OPCODE_NOT
, *reg
, *reg
);
492 emit(BRW_OPCODE_AND
, *reg
, *reg
, fs_reg(1));
494 struct brw_reg r1_6ud
= retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD
);
495 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
498 fs_inst
*inst
= emit(BRW_OPCODE_CMP
, *reg
,
501 inst
->conditional_mod
= BRW_CONDITIONAL_L
;
502 emit(BRW_OPCODE_AND
, *reg
, *reg
, fs_reg(1u));
509 fs_visitor::emit_math(fs_opcodes opcode
, fs_reg dst
, fs_reg src
)
521 assert(!"not reached: bad math opcode");
525 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
526 * might be able to do better by doing execsize = 1 math and then
527 * expanding that result out, but we would need to be careful with
530 * The hardware ignores source modifiers (negate and abs) on math
531 * instructions, so we also move to a temp to set those up.
533 if (intel
->gen
>= 6 && (src
.file
== UNIFORM
||
536 fs_reg expanded
= fs_reg(this, glsl_type::float_type
);
537 emit(BRW_OPCODE_MOV
, expanded
, src
);
541 fs_inst
*inst
= emit(opcode
, dst
, src
);
543 if (intel
->gen
< 6) {
545 inst
->mlen
= c
->dispatch_width
/ 8;
552 fs_visitor::emit_math(fs_opcodes opcode
, fs_reg dst
, fs_reg src0
, fs_reg src1
)
557 assert(opcode
== FS_OPCODE_POW
);
559 if (intel
->gen
>= 6) {
560 /* Can't do hstride == 0 args to gen6 math, so expand it out.
562 * The hardware ignores source modifiers (negate and abs) on math
563 * instructions, so we also move to a temp to set those up.
565 if (src0
.file
== UNIFORM
|| src0
.abs
|| src0
.negate
) {
566 fs_reg expanded
= fs_reg(this, glsl_type::float_type
);
567 emit(BRW_OPCODE_MOV
, expanded
, src0
);
571 if (src1
.file
== UNIFORM
|| src1
.abs
|| src1
.negate
) {
572 fs_reg expanded
= fs_reg(this, glsl_type::float_type
);
573 emit(BRW_OPCODE_MOV
, expanded
, src1
);
577 inst
= emit(opcode
, dst
, src0
, src1
);
579 emit(BRW_OPCODE_MOV
, fs_reg(MRF
, base_mrf
+ 1), src1
);
580 inst
= emit(opcode
, dst
, src0
, reg_null_f
);
582 inst
->base_mrf
= base_mrf
;
583 inst
->mlen
= 2 * c
->dispatch_width
/ 8;
589 * To be called after the last _mesa_add_state_reference() call, to
590 * set up prog_data.param[] for assign_curb_setup() and
591 * setup_pull_constants().
594 fs_visitor::setup_paramvalues_refs()
596 if (c
->dispatch_width
!= 8)
599 /* Set up the pointers to ParamValues now that that array is finalized. */
600 for (unsigned int i
= 0; i
< c
->prog_data
.nr_params
; i
++) {
601 c
->prog_data
.param
[i
] =
602 fp
->Base
.Parameters
->ParameterValues
[this->param_index
[i
]] +
603 this->param_offset
[i
];
608 fs_visitor::assign_curb_setup()
610 c
->prog_data
.curb_read_length
= ALIGN(c
->prog_data
.nr_params
, 8) / 8;
611 if (c
->dispatch_width
== 8) {
612 c
->prog_data
.first_curbe_grf
= c
->nr_payload_regs
;
614 c
->prog_data
.first_curbe_grf_16
= c
->nr_payload_regs
;
617 /* Map the offsets in the UNIFORM file to fixed HW regs. */
618 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
619 fs_inst
*inst
= (fs_inst
*)iter
.get();
621 for (unsigned int i
= 0; i
< 3; i
++) {
622 if (inst
->src
[i
].file
== UNIFORM
) {
623 int constant_nr
= inst
->src
[i
].hw_reg
+ inst
->src
[i
].reg_offset
;
624 struct brw_reg brw_reg
= brw_vec1_grf(c
->nr_payload_regs
+
628 inst
->src
[i
].file
= FIXED_HW_REG
;
629 inst
->src
[i
].fixed_hw_reg
= retype(brw_reg
, inst
->src
[i
].type
);
636 fs_visitor::calculate_urb_setup()
638 for (unsigned int i
= 0; i
< FRAG_ATTRIB_MAX
; i
++) {
643 /* Figure out where each of the incoming setup attributes lands. */
644 if (intel
->gen
>= 6) {
645 for (unsigned int i
= 0; i
< FRAG_ATTRIB_MAX
; i
++) {
646 if (brw
->fragment_program
->Base
.InputsRead
& BITFIELD64_BIT(i
)) {
647 urb_setup
[i
] = urb_next
++;
651 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
652 for (unsigned int i
= 0; i
< VERT_RESULT_MAX
; i
++) {
653 if (c
->key
.vp_outputs_written
& BITFIELD64_BIT(i
)) {
656 if (i
>= VERT_RESULT_VAR0
)
657 fp_index
= i
- (VERT_RESULT_VAR0
- FRAG_ATTRIB_VAR0
);
658 else if (i
<= VERT_RESULT_TEX7
)
664 urb_setup
[fp_index
] = urb_next
++;
669 /* Each attribute is 4 setup channels, each of which is half a reg. */
670 c
->prog_data
.urb_read_length
= urb_next
* 2;
674 fs_visitor::assign_urb_setup()
676 int urb_start
= c
->nr_payload_regs
+ c
->prog_data
.curb_read_length
;
678 /* Offset all the urb_setup[] index by the actual position of the
679 * setup regs, now that the location of the constants has been chosen.
681 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
682 fs_inst
*inst
= (fs_inst
*)iter
.get();
684 if (inst
->opcode
== FS_OPCODE_LINTERP
) {
685 assert(inst
->src
[2].file
== FIXED_HW_REG
);
686 inst
->src
[2].fixed_hw_reg
.nr
+= urb_start
;
689 if (inst
->opcode
== FS_OPCODE_CINTERP
) {
690 assert(inst
->src
[0].file
== FIXED_HW_REG
);
691 inst
->src
[0].fixed_hw_reg
.nr
+= urb_start
;
695 this->first_non_payload_grf
= urb_start
+ c
->prog_data
.urb_read_length
;
699 * Split large virtual GRFs into separate components if we can.
701 * This is mostly duplicated with what brw_fs_vector_splitting does,
702 * but that's really conservative because it's afraid of doing
703 * splitting that doesn't result in real progress after the rest of
704 * the optimization phases, which would cause infinite looping in
705 * optimization. We can do it once here, safely. This also has the
706 * opportunity to split interpolated values, or maybe even uniforms,
707 * which we don't have at the IR level.
709 * We want to split, because virtual GRFs are what we register
710 * allocate and spill (due to contiguousness requirements for some
711 * instructions), and they're what we naturally generate in the
712 * codegen process, but most virtual GRFs don't actually need to be
713 * contiguous sets of GRFs. If we split, we'll end up with reduced
714 * live intervals and better dead code elimination and coalescing.
717 fs_visitor::split_virtual_grfs()
719 int num_vars
= this->virtual_grf_next
;
720 bool split_grf
[num_vars
];
721 int new_virtual_grf
[num_vars
];
723 /* Try to split anything > 0 sized. */
724 for (int i
= 0; i
< num_vars
; i
++) {
725 if (this->virtual_grf_sizes
[i
] != 1)
728 split_grf
[i
] = false;
732 /* PLN opcodes rely on the delta_xy being contiguous. */
733 split_grf
[this->delta_x
.reg
] = false;
736 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
737 fs_inst
*inst
= (fs_inst
*)iter
.get();
739 /* Texturing produces 4 contiguous registers, so no splitting. */
740 if (inst
->is_tex()) {
741 split_grf
[inst
->dst
.reg
] = false;
745 /* Allocate new space for split regs. Note that the virtual
746 * numbers will be contiguous.
748 for (int i
= 0; i
< num_vars
; i
++) {
750 new_virtual_grf
[i
] = virtual_grf_alloc(1);
751 for (int j
= 2; j
< this->virtual_grf_sizes
[i
]; j
++) {
752 int reg
= virtual_grf_alloc(1);
753 assert(reg
== new_virtual_grf
[i
] + j
- 1);
756 this->virtual_grf_sizes
[i
] = 1;
760 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
761 fs_inst
*inst
= (fs_inst
*)iter
.get();
763 if (inst
->dst
.file
== GRF
&&
764 split_grf
[inst
->dst
.reg
] &&
765 inst
->dst
.reg_offset
!= 0) {
766 inst
->dst
.reg
= (new_virtual_grf
[inst
->dst
.reg
] +
767 inst
->dst
.reg_offset
- 1);
768 inst
->dst
.reg_offset
= 0;
770 for (int i
= 0; i
< 3; i
++) {
771 if (inst
->src
[i
].file
== GRF
&&
772 split_grf
[inst
->src
[i
].reg
] &&
773 inst
->src
[i
].reg_offset
!= 0) {
774 inst
->src
[i
].reg
= (new_virtual_grf
[inst
->src
[i
].reg
] +
775 inst
->src
[i
].reg_offset
- 1);
776 inst
->src
[i
].reg_offset
= 0;
780 this->live_intervals_valid
= false;
784 * Choose accesses from the UNIFORM file to demote to using the pull
787 * We allow a fragment shader to have more than the specified minimum
788 * maximum number of fragment shader uniform components (64). If
789 * there are too many of these, they'd fill up all of register space.
790 * So, this will push some of them out to the pull constant buffer and
791 * update the program to load them.
794 fs_visitor::setup_pull_constants()
796 /* Only allow 16 registers (128 uniform components) as push constants. */
797 unsigned int max_uniform_components
= 16 * 8;
798 if (c
->prog_data
.nr_params
<= max_uniform_components
)
801 if (c
->dispatch_width
== 16) {
802 fail("Pull constants not supported in 16-wide\n");
806 /* Just demote the end of the list. We could probably do better
807 * here, demoting things that are rarely used in the program first.
809 int pull_uniform_base
= max_uniform_components
;
810 int pull_uniform_count
= c
->prog_data
.nr_params
- pull_uniform_base
;
812 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
813 fs_inst
*inst
= (fs_inst
*)iter
.get();
815 for (int i
= 0; i
< 3; i
++) {
816 if (inst
->src
[i
].file
!= UNIFORM
)
819 int uniform_nr
= inst
->src
[i
].hw_reg
+ inst
->src
[i
].reg_offset
;
820 if (uniform_nr
< pull_uniform_base
)
823 fs_reg dst
= fs_reg(this, glsl_type::float_type
);
824 fs_inst
*pull
= new(mem_ctx
) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD
,
826 pull
->offset
= ((uniform_nr
- pull_uniform_base
) * 4) & ~15;
828 pull
->annotation
= inst
->annotation
;
832 inst
->insert_before(pull
);
834 inst
->src
[i
].file
= GRF
;
835 inst
->src
[i
].reg
= dst
.reg
;
836 inst
->src
[i
].reg_offset
= 0;
837 inst
->src
[i
].smear
= (uniform_nr
- pull_uniform_base
) & 3;
841 for (int i
= 0; i
< pull_uniform_count
; i
++) {
842 c
->prog_data
.pull_param
[i
] = c
->prog_data
.param
[pull_uniform_base
+ i
];
843 c
->prog_data
.pull_param_convert
[i
] =
844 c
->prog_data
.param_convert
[pull_uniform_base
+ i
];
846 c
->prog_data
.nr_params
-= pull_uniform_count
;
847 c
->prog_data
.nr_pull_params
= pull_uniform_count
;
851 fs_visitor::calculate_live_intervals()
853 int num_vars
= this->virtual_grf_next
;
854 int *def
= ralloc_array(mem_ctx
, int, num_vars
);
855 int *use
= ralloc_array(mem_ctx
, int, num_vars
);
859 if (this->live_intervals_valid
)
862 for (int i
= 0; i
< num_vars
; i
++) {
863 def
[i
] = MAX_INSTRUCTION
;
868 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
869 fs_inst
*inst
= (fs_inst
*)iter
.get();
871 if (inst
->opcode
== BRW_OPCODE_DO
) {
872 if (loop_depth
++ == 0)
874 } else if (inst
->opcode
== BRW_OPCODE_WHILE
) {
877 if (loop_depth
== 0) {
878 /* Patches up the use of vars marked for being live across
881 for (int i
= 0; i
< num_vars
; i
++) {
882 if (use
[i
] == loop_start
) {
888 for (unsigned int i
= 0; i
< 3; i
++) {
889 if (inst
->src
[i
].file
== GRF
&& inst
->src
[i
].reg
!= 0) {
890 int reg
= inst
->src
[i
].reg
;
895 def
[reg
] = MIN2(loop_start
, def
[reg
]);
896 use
[reg
] = loop_start
;
898 /* Nobody else is going to go smash our start to
899 * later in the loop now, because def[reg] now
900 * points before the bb header.
905 if (inst
->dst
.file
== GRF
&& inst
->dst
.reg
!= 0) {
906 int reg
= inst
->dst
.reg
;
909 def
[reg
] = MIN2(def
[reg
], ip
);
911 def
[reg
] = MIN2(def
[reg
], loop_start
);
919 ralloc_free(this->virtual_grf_def
);
920 ralloc_free(this->virtual_grf_use
);
921 this->virtual_grf_def
= def
;
922 this->virtual_grf_use
= use
;
924 this->live_intervals_valid
= true;
928 * Attempts to move immediate constants into the immediate
929 * constant slot of following instructions.
931 * Immediate constants are a bit tricky -- they have to be in the last
932 * operand slot, you can't do abs/negate on them,
936 fs_visitor::propagate_constants()
938 bool progress
= false;
940 calculate_live_intervals();
942 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
943 fs_inst
*inst
= (fs_inst
*)iter
.get();
945 if (inst
->opcode
!= BRW_OPCODE_MOV
||
947 inst
->dst
.file
!= GRF
|| inst
->src
[0].file
!= IMM
||
948 inst
->dst
.type
!= inst
->src
[0].type
||
949 (c
->dispatch_width
== 16 &&
950 (inst
->force_uncompressed
|| inst
->force_sechalf
)))
953 /* Don't bother with cases where we should have had the
954 * operation on the constant folded in GLSL already.
959 /* Found a move of a constant to a GRF. Find anything else using the GRF
960 * before it's written, and replace it with the constant if we can.
962 exec_list_iterator scan_iter
= iter
;
964 for (; scan_iter
.has_next(); scan_iter
.next()) {
965 fs_inst
*scan_inst
= (fs_inst
*)scan_iter
.get();
967 if (scan_inst
->opcode
== BRW_OPCODE_DO
||
968 scan_inst
->opcode
== BRW_OPCODE_WHILE
||
969 scan_inst
->opcode
== BRW_OPCODE_ELSE
||
970 scan_inst
->opcode
== BRW_OPCODE_ENDIF
) {
974 for (int i
= 2; i
>= 0; i
--) {
975 if (scan_inst
->src
[i
].file
!= GRF
||
976 scan_inst
->src
[i
].reg
!= inst
->dst
.reg
||
977 scan_inst
->src
[i
].reg_offset
!= inst
->dst
.reg_offset
)
980 /* Don't bother with cases where we should have had the
981 * operation on the constant folded in GLSL already.
983 if (scan_inst
->src
[i
].negate
|| scan_inst
->src
[i
].abs
)
986 switch (scan_inst
->opcode
) {
988 scan_inst
->src
[i
] = inst
->src
[0];
995 scan_inst
->src
[i
] = inst
->src
[0];
997 } else if (i
== 0 && scan_inst
->src
[1].file
!= IMM
) {
998 /* Fit this constant in by commuting the operands */
999 scan_inst
->src
[0] = scan_inst
->src
[1];
1000 scan_inst
->src
[1] = inst
->src
[0];
1005 case BRW_OPCODE_CMP
:
1007 scan_inst
->src
[i
] = inst
->src
[0];
1009 } else if (i
== 0 && scan_inst
->src
[1].file
!= IMM
) {
1012 new_cmod
= brw_swap_cmod(scan_inst
->conditional_mod
);
1013 if (new_cmod
!= ~0u) {
1014 /* Fit this constant in by swapping the operands and
1017 scan_inst
->src
[0] = scan_inst
->src
[1];
1018 scan_inst
->src
[1] = inst
->src
[0];
1019 scan_inst
->conditional_mod
= new_cmod
;
1025 case BRW_OPCODE_SEL
:
1027 scan_inst
->src
[i
] = inst
->src
[0];
1029 } else if (i
== 0 && scan_inst
->src
[1].file
!= IMM
) {
1030 /* Fit this constant in by swapping the operands and
1031 * flipping the predicate
1033 scan_inst
->src
[0] = scan_inst
->src
[1];
1034 scan_inst
->src
[1] = inst
->src
[0];
1035 scan_inst
->predicate_inverse
= !scan_inst
->predicate_inverse
;
1042 if (scan_inst
->dst
.file
== GRF
&&
1043 scan_inst
->dst
.reg
== inst
->dst
.reg
&&
1044 (scan_inst
->dst
.reg_offset
== inst
->dst
.reg_offset
||
1045 scan_inst
->is_tex())) {
1052 this->live_intervals_valid
= false;
1057 * Must be called after calculate_live_intervales() to remove unused
1058 * writes to registers -- register allocation will fail otherwise
1059 * because something deffed but not used won't be considered to
1060 * interfere with other regs.
1063 fs_visitor::dead_code_eliminate()
1065 bool progress
= false;
1068 calculate_live_intervals();
1070 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
1071 fs_inst
*inst
= (fs_inst
*)iter
.get();
1073 if (inst
->dst
.file
== GRF
&& this->virtual_grf_use
[inst
->dst
.reg
] <= pc
) {
1082 live_intervals_valid
= false;
1088 fs_visitor::register_coalesce()
1090 bool progress
= false;
1094 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
1095 fs_inst
*inst
= (fs_inst
*)iter
.get();
1097 /* Make sure that we dominate the instructions we're going to
1098 * scan for interfering with our coalescing, or we won't have
1099 * scanned enough to see if anything interferes with our
1100 * coalescing. We don't dominate the following instructions if
1101 * we're in a loop or an if block.
1103 switch (inst
->opcode
) {
1107 case BRW_OPCODE_WHILE
:
1113 case BRW_OPCODE_ENDIF
:
1117 if (loop_depth
|| if_depth
)
1120 if (inst
->opcode
!= BRW_OPCODE_MOV
||
1123 inst
->dst
.file
!= GRF
|| inst
->src
[0].file
!= GRF
||
1124 inst
->dst
.type
!= inst
->src
[0].type
)
1127 bool has_source_modifiers
= inst
->src
[0].abs
|| inst
->src
[0].negate
;
1129 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1130 * them: check for no writes to either one until the exit of the
1133 bool interfered
= false;
1134 exec_list_iterator scan_iter
= iter
;
1136 for (; scan_iter
.has_next(); scan_iter
.next()) {
1137 fs_inst
*scan_inst
= (fs_inst
*)scan_iter
.get();
1139 if (scan_inst
->dst
.file
== GRF
) {
1140 if (scan_inst
->dst
.reg
== inst
->dst
.reg
&&
1141 (scan_inst
->dst
.reg_offset
== inst
->dst
.reg_offset
||
1142 scan_inst
->is_tex())) {
1146 if (scan_inst
->dst
.reg
== inst
->src
[0].reg
&&
1147 (scan_inst
->dst
.reg_offset
== inst
->src
[0].reg_offset
||
1148 scan_inst
->is_tex())) {
1154 /* The gen6 MATH instruction can't handle source modifiers, so avoid
1155 * coalescing those for now. We should do something more specific.
1157 if (intel
->gen
>= 6 && scan_inst
->is_math() && has_source_modifiers
) {
1166 /* Rewrite the later usage to point at the source of the move to
1169 for (exec_list_iterator scan_iter
= iter
; scan_iter
.has_next();
1171 fs_inst
*scan_inst
= (fs_inst
*)scan_iter
.get();
1173 for (int i
= 0; i
< 3; i
++) {
1174 if (scan_inst
->src
[i
].file
== GRF
&&
1175 scan_inst
->src
[i
].reg
== inst
->dst
.reg
&&
1176 scan_inst
->src
[i
].reg_offset
== inst
->dst
.reg_offset
) {
1177 scan_inst
->src
[i
].reg
= inst
->src
[0].reg
;
1178 scan_inst
->src
[i
].reg_offset
= inst
->src
[0].reg_offset
;
1179 scan_inst
->src
[i
].abs
|= inst
->src
[0].abs
;
1180 scan_inst
->src
[i
].negate
^= inst
->src
[0].negate
;
1181 scan_inst
->src
[i
].smear
= inst
->src
[0].smear
;
1191 live_intervals_valid
= false;
1198 fs_visitor::compute_to_mrf()
1200 bool progress
= false;
1203 calculate_live_intervals();
1205 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
1206 fs_inst
*inst
= (fs_inst
*)iter
.get();
1211 if (inst
->opcode
!= BRW_OPCODE_MOV
||
1213 inst
->dst
.file
!= MRF
|| inst
->src
[0].file
!= GRF
||
1214 inst
->dst
.type
!= inst
->src
[0].type
||
1215 inst
->src
[0].abs
|| inst
->src
[0].negate
|| inst
->src
[0].smear
!= -1)
1218 /* Work out which hardware MRF registers are written by this
1221 int mrf_low
= inst
->dst
.hw_reg
& ~BRW_MRF_COMPR4
;
1223 if (inst
->dst
.hw_reg
& BRW_MRF_COMPR4
) {
1224 mrf_high
= mrf_low
+ 4;
1225 } else if (c
->dispatch_width
== 16 &&
1226 (!inst
->force_uncompressed
&& !inst
->force_sechalf
)) {
1227 mrf_high
= mrf_low
+ 1;
1232 /* Can't compute-to-MRF this GRF if someone else was going to
1235 if (this->virtual_grf_use
[inst
->src
[0].reg
] > ip
)
1238 /* Found a move of a GRF to a MRF. Let's see if we can go
1239 * rewrite the thing that made this GRF to write into the MRF.
1242 for (scan_inst
= (fs_inst
*)inst
->prev
;
1243 scan_inst
->prev
!= NULL
;
1244 scan_inst
= (fs_inst
*)scan_inst
->prev
) {
1245 if (scan_inst
->dst
.file
== GRF
&&
1246 scan_inst
->dst
.reg
== inst
->src
[0].reg
) {
1247 /* Found the last thing to write our reg we want to turn
1248 * into a compute-to-MRF.
1251 if (scan_inst
->is_tex()) {
1252 /* texturing writes several continuous regs, so we can't
1253 * compute-to-mrf that.
1258 /* If it's predicated, it (probably) didn't populate all
1259 * the channels. We might be able to rewrite everything
1260 * that writes that reg, but it would require smarter
1261 * tracking to delay the rewriting until complete success.
1263 if (scan_inst
->predicated
)
1266 /* If it's half of register setup and not the same half as
1267 * our MOV we're trying to remove, bail for now.
1269 if (scan_inst
->force_uncompressed
!= inst
->force_uncompressed
||
1270 scan_inst
->force_sechalf
!= inst
->force_sechalf
) {
1274 /* SEND instructions can't have MRF as a destination. */
1275 if (scan_inst
->mlen
)
1278 if (intel
->gen
>= 6) {
1279 /* gen6 math instructions must have the destination be
1280 * GRF, so no compute-to-MRF for them.
1282 if (scan_inst
->is_math()) {
1287 if (scan_inst
->dst
.reg_offset
== inst
->src
[0].reg_offset
) {
1288 /* Found the creator of our MRF's source value. */
1289 scan_inst
->dst
.file
= MRF
;
1290 scan_inst
->dst
.hw_reg
= inst
->dst
.hw_reg
;
1291 scan_inst
->saturate
|= inst
->saturate
;
1298 /* We don't handle flow control here. Most computation of
1299 * values that end up in MRFs are shortly before the MRF
1302 if (scan_inst
->opcode
== BRW_OPCODE_DO
||
1303 scan_inst
->opcode
== BRW_OPCODE_WHILE
||
1304 scan_inst
->opcode
== BRW_OPCODE_ELSE
||
1305 scan_inst
->opcode
== BRW_OPCODE_ENDIF
) {
1309 /* You can't read from an MRF, so if someone else reads our
1310 * MRF's source GRF that we wanted to rewrite, that stops us.
1312 bool interfered
= false;
1313 for (int i
= 0; i
< 3; i
++) {
1314 if (scan_inst
->src
[i
].file
== GRF
&&
1315 scan_inst
->src
[i
].reg
== inst
->src
[0].reg
&&
1316 scan_inst
->src
[i
].reg_offset
== inst
->src
[0].reg_offset
) {
1323 if (scan_inst
->dst
.file
== MRF
) {
1324 /* If somebody else writes our MRF here, we can't
1325 * compute-to-MRF before that.
1327 int scan_mrf_low
= scan_inst
->dst
.hw_reg
& ~BRW_MRF_COMPR4
;
1330 if (scan_inst
->dst
.hw_reg
& BRW_MRF_COMPR4
) {
1331 scan_mrf_high
= scan_mrf_low
+ 4;
1332 } else if (c
->dispatch_width
== 16 &&
1333 (!scan_inst
->force_uncompressed
&&
1334 !scan_inst
->force_sechalf
)) {
1335 scan_mrf_high
= scan_mrf_low
+ 1;
1337 scan_mrf_high
= scan_mrf_low
;
1340 if (mrf_low
== scan_mrf_low
||
1341 mrf_low
== scan_mrf_high
||
1342 mrf_high
== scan_mrf_low
||
1343 mrf_high
== scan_mrf_high
) {
1348 if (scan_inst
->mlen
> 0) {
1349 /* Found a SEND instruction, which means that there are
1350 * live values in MRFs from base_mrf to base_mrf +
1351 * scan_inst->mlen - 1. Don't go pushing our MRF write up
1354 if (mrf_low
>= scan_inst
->base_mrf
&&
1355 mrf_low
< scan_inst
->base_mrf
+ scan_inst
->mlen
) {
1358 if (mrf_high
>= scan_inst
->base_mrf
&&
1359 mrf_high
< scan_inst
->base_mrf
+ scan_inst
->mlen
) {
1370 * Walks through basic blocks, looking for repeated MRF writes and
1371 * removing the later ones.
/* Dead-MRF-write elimination: tracks the most recent eligible MOV into
 * each MRF slot and (in the portion elided from this extract) removes a
 * later MOV that writes the identical value to the same MRF.
 *
 * NOTE(review): the gaps in the embedded original line numbers (1375,
 * 1378, 1381-1382, 1384, 1387, 1389, 1391, 1395-1399, 1404-1409,
 * 1413-1414, 1418, 1421-1423, 1430-1433, 1439+) show this extract has
 * dropped statements (braces, returns, continues, the final result);
 * consult the full upstream file before changing any logic here.
 */
1374 fs_visitor::remove_duplicate_mrf_writes()
/* One tracking slot per MRF: last unpredicated GRF->MRF MOV seen, or NULL. */
1376 fs_inst
*last_mrf_move
[16];
1377 bool progress
= false;
1379 /* Need to update the MRF tracking for compressed instructions. */
/* Presumably the 16-wide (compressed) case bails out here -- the
 * statement under this if (original line 1381) is missing from the
 * extract; confirm against upstream. */
1380 if (c
->dispatch_width
== 16)
1383 memset(last_mrf_move
, 0, sizeof(last_mrf_move
));
/* Walk the instruction stream in program order. */
1385 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
1386 fs_inst
*inst
= (fs_inst
*)iter
.get();
/* At control-flow boundaries, conservatively forget every tracked MRF
 * move (the memset below); the remaining switch arms (original lines
 * 1389, 1391, 1395-1399) are absent from this extract. */
1388 switch (inst
->opcode
) {
1390 case BRW_OPCODE_WHILE
:
1392 case BRW_OPCODE_ELSE
:
1393 case BRW_OPCODE_ENDIF
:
1394 memset(last_mrf_move
, 0, sizeof(last_mrf_move
));
/* A MOV into an MRF that equals the previously recorded MOV into the
 * same MRF is redundant; the actual removal and progress update fall in
 * the missing 1404-1409 range. */
1400 if (inst
->opcode
== BRW_OPCODE_MOV
&&
1401 inst
->dst
.file
== MRF
) {
1402 fs_inst
*prev_inst
= last_mrf_move
[inst
->dst
.hw_reg
];
1403 if (prev_inst
&& inst
->equals(prev_inst
)) {
1410 /* Clear out the last-write records for MRFs that were overwritten. */
1411 if (inst
->dst
.file
== MRF
) {
1412 last_mrf_move
[inst
->dst
.hw_reg
] = NULL
;
/* mlen > 0 marks a SEND-style instruction with implied MRF writes,
 * so the records for its message registers must be invalidated. */
1415 if (inst
->mlen
> 0) {
1416 /* Found a SEND instruction, which will include two or fewer
1417 * implied MRF writes. We could do better here.
1419 for (int i
= 0; i
< implied_mrf_writes(inst
); i
++) {
1420 last_mrf_move
[inst
->base_mrf
+ i
] = NULL
;
1424 /* Clear out any MRF move records whose sources got overwritten. */
1425 if (inst
->dst
.file
== GRF
) {
1426 for (unsigned int i
= 0; i
< Elements(last_mrf_move
); i
++) {
1427 if (last_mrf_move
[i
] &&
1428 last_mrf_move
[i
]->src
[0].reg
== inst
->dst
.reg
) {
1429 last_mrf_move
[i
] = NULL
;
/* Record this MOV as the latest write to its MRF, but only when it is
 * an unpredicated GRF->MRF move -- presumably a predicated MOV cannot
 * be trusted to define every channel; TODO confirm against upstream. */
1434 if (inst
->opcode
== BRW_OPCODE_MOV
&&
1435 inst
->dst
.file
== MRF
&&
1436 inst
->src
[0].file
== GRF
&&
1437 !inst
->predicated
) {
1438 last_mrf_move
[inst
->dst
.hw_reg
] = inst
;
/* Live-interval interference test between two virtual GRFs: the overlap
 * window is [latest def of a/b, earliest last-use of a/b].
 *
 * NOTE(review): the tail of this function (original lines ~1483-1486,
 * including the normal return path and closing brace) is missing from
 * this extract; see the full upstream file.
 */
1446 fs_visitor::virtual_grf_interferes(int a
, int b
)
/* Intersection of the two registers' def/use ranges. */
1448 int start
= MAX2(this->virtual_grf_def
[a
], this->virtual_grf_def
[b
]);
1449 int end
= MIN2(this->virtual_grf_use
[a
], this->virtual_grf_use
[b
]);
1451 /* We can't handle dead register writes here, without iterating
1452 * over the whole instruction stream to find every single dead
1453 * write to that register to compare to the live interval of the
1454 * other register. Just assert that dead_code_eliminate() has been
/* Sanity check: each register is either used somewhere or was never
 * defined (def left at MAX_INSTRUCTION), i.e. DCE already ran. */
1457 assert((this->virtual_grf_use
[a
] != -1 ||
1458 this->virtual_grf_def
[a
] == MAX_INSTRUCTION
) &&
1459 (this->virtual_grf_use
[b
] != -1 ||
1460 this->virtual_grf_def
[b
] == MAX_INSTRUCTION
));
1462 /* If the register is used to store 16 values of less than float
1463 * size (only the case for pixel_[xy]), then we can't allocate
1464 * another dword-sized thing to that register that would be used in
1465 * the same instruction. This is because when the GPU decodes (for
1468 * (declare (in ) vec4 gl_FragCoord@0x97766a0)
1469 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr };
1471 * it's actually processed as:
1472 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 };
1473 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf };
1475 * so our second half values in g6 got overwritten in the first
1478 if (c
->dispatch_width
== 16 && (this->pixel_x
.reg
== a
||
1479 this->pixel_x
.reg
== b
||
1480 this->pixel_y
.reg
== a
||
1481 this->pixel_y
.reg
== b
)) {
/* Inclusive comparison (<=) for the pixel_x/pixel_y special case:
 * overlap even at a single instruction counts as interference here. */
1482 return start
<= end
;
/* NOTE(review): fragment of what appears to be fs_visitor::run() -- the
 * function signature (original line ~1489) and many interior statements
 * (e.g. 1499-1501, 1506-1510, 1524-1528, 1533-1537, 1555-1559,
 * 1563-1567, 1570-1575) are missing from this extract. The comments
 * below describe only what the visible lines show; reconstruct from the
 * full upstream file before editing.
 */
1491 uint32_t prog_offset_16
= 0;
/* Remember the uniform count so the assert at the end can verify no
 * pass added uniforms behind our back. */
1492 uint32_t orig_nr_params
= c
->prog_data
.nr_params
;
1494 brw_wm_payload_setup(brw
, c
);
/* Second (16-wide) compile: pad the instruction store to a 64-byte
 * boundary and record where this program starts. */
1496 if (c
->dispatch_width
== 16) {
1497 /* align to 64 byte boundary. */
1498 while ((c
->func
.nr_insn
* sizeof(struct brw_instruction
)) % 64) {
1502 /* Save off the start of this 16-wide program in case we succeed. */
1503 prog_offset_16
= c
->func
.nr_insn
* sizeof(struct brw_instruction
);
1505 brw_set_compression_control(p
, BRW_COMPRESSION_COMPRESSED
);
/* Setup passes; the gen4 vs gen6 interpolation branch structure
 * (original lines 1512/1514) is elided in this extract. */
1511 calculate_urb_setup();
1513 emit_interpolation_setup_gen4();
1515 emit_interpolation_setup_gen6();
1517 /* Generate FS IR for main(). (the visitor only descends into
1518 * functions called "main").
1520 foreach_iter(exec_list_iterator
, iter
, *shader
->ir
) {
1521 ir_instruction
*ir
= (ir_instruction
*)iter
.get();
1523 this->result
= reg_undef
;
1529 split_virtual_grfs();
1531 setup_paramvalues_refs();
1532 setup_pull_constants();
/* Optimization loop: each pass reports whether it changed anything;
 * the surrounding do/while on `progress` is elided in this extract. */
1538 progress
= remove_duplicate_mrf_writes() || progress
;
1540 progress
= propagate_constants() || progress
;
1541 progress
= register_coalesce() || progress
;
1542 progress
= compute_to_mrf() || progress
;
1543 progress
= dead_code_eliminate() || progress
;
1546 schedule_instructions();
1548 assign_curb_setup();
1552 /* Debug of register spilling: Go spill everything. */
1553 int virtual_grf_count
= virtual_grf_next
;
1554 for (int i
= 1; i
< virtual_grf_count
; i
++) {
/* Register allocation: trivial path vs. retry-until-success path
 * (spilling presumably happens in the elided loop body). */
1560 assign_regs_trivial();
1562 while (!assign_regs()) {
/* Every push/pop of the compression-control stacks must be balanced
 * by the time code generation finishes. */
1568 assert(force_uncompressed_stack
== 0);
1569 assert(force_sechalf_stack
== 0);
/* Record per-dispatch-width register usage in prog_data; the 16-wide
 * branch also publishes the saved program offset. */
1576 if (c
->dispatch_width
== 8) {
1577 c
->prog_data
.reg_blocks
= brw_register_blocks(grf_used
);
1579 c
->prog_data
.reg_blocks_16
= brw_register_blocks(grf_used
);
1580 c
->prog_data
.prog_offset_16
= prog_offset_16
;
1582 /* Make sure we didn't try to sneak in an extra uniform */
1583 assert(orig_nr_params
== c
->prog_data
.nr_params
);
1590 brw_wm_fs_emit(struct brw_context
*brw
, struct brw_wm_compile
*c
)
1592 struct intel_context
*intel
= &brw
->intel
;
1593 struct gl_context
*ctx
= &intel
->ctx
;
1594 struct gl_shader_program
*prog
= ctx
->Shader
.CurrentFragmentProgram
;
1599 struct brw_shader
*shader
=
1600 (brw_shader
*) prog
->_LinkedShaders
[MESA_SHADER_FRAGMENT
];
1604 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1605 printf("GLSL IR for native fragment shader %d:\n", prog
->Name
);
1606 _mesa_print_ir(shader
->ir
, NULL
);
1610 /* Now the main event: Visit the shader IR and generate our FS IR for it.
1612 c
->dispatch_width
= 8;
1614 fs_visitor
v(c
, shader
);
1616 /* FINISHME: Cleanly fail, test at link time, etc. */
1617 assert(!"not reached");
1621 if (intel
->gen
>= 5 && c
->prog_data
.nr_pull_params
== 0) {
1622 c
->dispatch_width
= 16;
1623 fs_visitor
v2(c
, shader
);
1624 v2
.import_uniforms(v
.variable_ht
);
1628 c
->prog_data
.dispatch_width
= 8;