src/mesa/drivers/dri/i965/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "../glsl/glsl_types.h"
#include "../glsl/ir_optimization.h"
#include "../glsl/ir_print_visitor.h"

#define MAX_INSTRUCTION (1 << 30)
static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);

struct gl_shader *
brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
{
   struct brw_shader *shader;

   shader = rzalloc(NULL, struct brw_shader);
   if (shader) {
      shader->base.Type = type;
      shader->base.Name = name;
      _mesa_init_shader(ctx, &shader->base);
   }

   return &shader->base;
}

struct gl_shader_program *
brw_new_shader_program(struct gl_context *ctx, GLuint name)
{
   struct brw_shader_program *prog;
   prog = rzalloc(NULL, struct brw_shader_program);
   if (prog) {
      prog->base.Name = name;
      _mesa_init_shader_program(ctx, &prog->base);
   }
   return &prog->base;
}

GLboolean
brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;

   struct brw_shader *shader =
      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (shader != NULL) {
      void *mem_ctx = ralloc_context(NULL);
      bool progress;

      if (shader->ir)
         ralloc_free(shader->ir);
      shader->ir = new(shader) exec_list;
      clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

      do_mat_op_to_vec(shader->ir);
      lower_instructions(shader->ir,
                         MOD_TO_FRACT |
                         DIV_TO_MUL_RCP |
                         SUB_TO_ADD_NEG |
                         EXP_TO_EXP2 |
                         LOG_TO_LOG2);

      /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
       * if-statements need to be flattened.
       */
      if (intel->gen < 6)
         lower_if_to_cond_assign(shader->ir, 16);

      do_lower_texture_projection(shader->ir);
      do_vec_index_to_cond_assign(shader->ir);
      brw_do_cubemap_normalize(shader->ir);
      lower_noise(shader->ir);
      lower_quadop_vector(shader->ir, false);
      lower_variable_index_to_cond_assign(shader->ir,
                                          GL_TRUE, /* input */
                                          GL_TRUE, /* output */
                                          GL_TRUE, /* temp */
                                          GL_TRUE /* uniform */
                                          );

      do {
         progress = false;

         brw_do_channel_expressions(shader->ir);
         brw_do_vector_splitting(shader->ir);

         progress = do_lower_jumps(shader->ir, true, true,
                                   true, /* main return */
                                   false, /* continue */
                                   false /* loops */
                                   ) || progress;

         progress = do_common_optimization(shader->ir, true, 32) || progress;
      } while (progress);

      validate_ir_tree(shader->ir);

      reparent_ir(shader->ir, shader->ir);
      ralloc_free(mem_ctx);
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}

static int
type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}
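
/* A worked example of type_size() (illustration only, not from the original
 * file): every component occupies one scalar register slot in this backend,
 * so
 *
 *    float                          -> 1
 *    vec4                           -> 4
 *    mat3                           -> 9  (components() = columns * rows)
 *    float[4]                       -> 4
 *    struct { vec3 a; float b; }    -> 4
 *    sampler2D                      -> 0  (baked in at link time)
 */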

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      return 1;
   case FS_OPCODE_POW:
      return 2;
   case FS_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case FS_OPCODE_TXD:
   case FS_OPCODE_TXL:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);

      /* This slot is always unused. */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}
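
/* Usage sketch (hypothetical values, for illustration): allocating a vec4
 * temporary reserves four consecutive slots in one virtual GRF:
 *
 *    int vgrf = virtual_grf_alloc(4);   // virtual_grf_sizes[vgrf] == 4
 *
 * The size array starts at 16 entries and doubles as needed, so allocation
 * is amortized O(1); the comment above suggests index 0 stays reserved as
 * an unused slot.
 */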

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = type;
}

int
brw_type_for_base_type(const struct glsl_type *type)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
      return BRW_REGISTER_TYPE_F;
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      return BRW_REGISTER_TYPE_D;
   case GLSL_TYPE_UINT:
      return BRW_REGISTER_TYPE_UD;
   case GLSL_TYPE_ARRAY:
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_SAMPLER:
      /* These should be overridden with the type of the member when
       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
       * way to trip up if we don't.
       */
      return BRW_REGISTER_TYPE_UD;
   default:
      assert(!"not reached");
      return BRW_REGISTER_TYPE_F;
   }
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         unsigned int param = c->prog_data.nr_params++;

         assert(param < ARRAY_SIZE(c->prog_data.param));

         switch (type->base_type) {
         case GLSL_TYPE_FLOAT:
            c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
            break;
         case GLSL_TYPE_UINT:
            c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
            break;
         case GLSL_TYPE_INT:
            c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
            break;
         case GLSL_TYPE_BOOL:
            c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
            break;
         default:
            assert(!"not reached");
            c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
            break;
         }
         this->param_index[param] = loc;
         this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
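
/* Layout illustration (assumed, not from the original source): a uniform
 * declared as
 *
 *    uniform mat2 m;
 *
 * is walked column by column as two vec2s, producing four float params
 * whose (param_index, param_offset) pairs are (loc, 0), (loc, 1),
 * (loc + 1, 0), (loc + 1, 1) -- matching the column-major order that
 * ir_to_mesa stored the values in.
 */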


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const struct gl_builtin_uniform_desc *statevar = NULL;

   for (unsigned int i = 0; _mesa_builtin_uniform_desc[i].name; i++) {
      statevar = &_mesa_builtin_uniform_desc[i];
      if (strcmp(ir->name, _mesa_builtin_uniform_desc[i].name) == 0)
         break;
   }

   if (!statevar->name) {
      this->fail = true;
      printf("Failed to find builtin uniform `%s'\n", ir->name);
      return;
   }

   int array_count;
   if (ir->type->is_array()) {
      array_count = ir->type->length;
   } else {
      array_count = 1;
   }

   for (int a = 0; a < array_count; a++) {
      for (unsigned int i = 0; i < statevar->num_elements; i++) {
         struct gl_builtin_uniform_element *element = &statevar->elements[i];
         int tokens[STATE_LENGTH];

         memcpy(tokens, element->tokens, sizeof(element->tokens));
         if (ir->type->is_array()) {
            tokens[1] = a;
         }

         /* This state reference has already been setup by ir_to_mesa,
          * but we'll get the same index back here.
          */
         int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                               (gl_state_index *)tokens);

         /* Add each of the unique swizzles of the element as a
          * parameter.  This'll end up matching the expected layout of
          * the array/matrix/structure we're trying to fill in.
          */
         int last_swiz = -1;
         for (unsigned int i = 0; i < 4; i++) {
            int swiz = GET_SWZ(element->swizzle, i);
            if (swiz == last_swiz)
               break;
            last_swiz = swiz;

            c->prog_data.param_convert[c->prog_data.nr_params] =
               PARAM_NO_CONVERT;
            this->param_index[c->prog_data.nr_params] = index;
            this->param_offset[c->prog_data.nr_params] = swiz;
            c->prog_data.nr_params++;
         }
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
   } else {
      emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_MOV, wpos,
           fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
   } else {
      emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
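
/* A sketch of the Y flip above (informal): when flipping, the single ADD
 * computes
 *
 *    gl_FragCoord.y = (drawable_height - 1.0 + center_offset) - pixel_y
 *
 * by negating pixel_y and folding the height into the constant, so the
 * flipped case costs no more instructions than the unflipped one.
 */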

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         this->fail = true;
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (c->key.flat_shade && (location == FRAG_ATTRIB_COL0 ||
                                   location == FRAG_ATTRIB_COL1)) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int c = 0; c < type->vector_elements; c++) {
               struct brw_reg interp = interp_reg(location, c);
               interp = suboffset(interp, 3);
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Perspective interpolation case. */
            for (unsigned int c = 0; c < type->vector_elements; c++) {
               struct brw_reg interp = interp_reg(location, c);
               emit(FS_OPCODE_LINTERP, attr,
                    this->delta_x, this->delta_y, fs_reg(interp));
               attr.reg_offset++;
            }

            if (intel->gen < 6) {
               attr.reg_offset -= type->vector_elements;
               for (unsigned int c = 0; c < type->vector_elements; c++) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  attr.reg_offset++;
               }
            }
         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
                           fs_reg(r1_6ud),
                           fs_reg(1u << 31));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
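
/* Gen6 note (informal; the bit position is inferred from the shift count):
 * the "primitive is back-facing" flag lives at bit 15 of g0.0.  ASR by 15
 * brings it down to bit 0, NOT inverts it, and AND 1 discards the other
 * shifted-down payload bits, e.g.
 *
 *    back-facing:  ~(0x00008000 >> 15) & 1 == 0
 *    front-facing: ~(0x00000000 >> 15) & 1 == 1
 */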

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6 && (src.file == UNIFORM ||
                           src.abs ||
                           src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = 1;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   assert(opcode == FS_OPCODE_POW);

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         emit(BRW_OPCODE_MOV, expanded, src0);
         src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         emit(BRW_OPCODE_MOV, expanded, src1);
         src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1);
      inst = emit(opcode, dst, src0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   return inst;
}
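
/* Usage sketch (illustrative): POW is the only two-source math opcode.
 * Judging from the base_mrf/mlen setup above, on pre-gen6 the call
 *
 *    emit_math(FS_OPCODE_POW, dst, x, y);
 *
 * stages y in message register m3 (base_mrf + 1) and sends a two-register
 * payload, while gen6+ emits a plain two-source math instruction.
 */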

void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
         reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
         reg = emit_frontfacing_interpolation(ir);
      } else {
         reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   }

   if (ir->mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
      } else {
         setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
         break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);
   this->result.type = brw_type_for_base_type(ir->type);

   if (index) {
      assert(this->result.file == UNIFORM ||
             (this->result.file == GRF &&
              this->result.reg != 0));
      this->result.reg_offset += index->value.i[0] * element_size;
   } else {
      assert(!"FINISHME: non-constant array element");
   }
}

/* Instruction selection: Produce a MOV.sat instead of
 * MIN(MAX(val, 0), 1) when possible.
 */
bool
fs_visitor::try_emit_saturate(ir_expression *ir)
{
   ir_rvalue *sat_val = ir->as_rvalue_to_saturate();

   if (!sat_val)
      return false;

   sat_val->accept(this);
   fs_reg src = this->result;

   this->result = fs_reg(this, ir->type);
   fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
   inst->saturate = true;

   return true;
}
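
/* Example of the pattern this matches (illustration): GLSL such as
 *
 *    x = clamp(y, 0.0, 1.0);
 *
 * arrives flagged as a saturate candidate and collapses to a single
 * MOV.sat, rather than the CMP/SEL pair a min/max lowering would cost.
 */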

static uint32_t
brw_conditional_for_comparison(unsigned int op)
{
   switch (op) {
   case ir_binop_less:
      return BRW_CONDITIONAL_L;
   case ir_binop_greater:
      return BRW_CONDITIONAL_G;
   case ir_binop_lequal:
      return BRW_CONDITIONAL_LE;
   case ir_binop_gequal:
      return BRW_CONDITIONAL_GE;
   case ir_binop_equal:
   case ir_binop_all_equal: /* same as equal for scalars */
      return BRW_CONDITIONAL_Z;
   case ir_binop_nequal:
   case ir_binop_any_nequal: /* same as nequal for scalars */
      return BRW_CONDITIONAL_NZ;
   default:
      assert(!"not reached: bad operation for comparison");
      return BRW_CONDITIONAL_NZ;
   }
}

void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[2], temp;
   fs_inst *inst;

   assert(ir->get_num_operands() <= 2);

   if (try_emit_saturate(ir))
      return;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
         ir_print_visitor v;
         printf("Failed to get tree for expression operand:\n");
         ir->operands[operand]->accept(&v);
         this->fail = true;
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to
       * scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * ones complement of the whole register, not just bit 0.
       */
      emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      this->result = op[0];
      break;
   case ir_unop_sign:
      temp = fs_reg(this, ir->type);

      emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
      inst->predicated = true;

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
      inst->predicated = true;

      break;
   case ir_unop_rcp:
      emit_math(FS_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(FS_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(FS_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(FS_OPCODE_DDX, this->result, op[0]);
      break;
   case ir_unop_dFdy:
      emit(FS_OPCODE_DDY, this->result, op[0]);
      break;

   case ir_binop_add:
      emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
      break;
   case ir_binop_div:
      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      break;
   case ir_binop_mod:
      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_all_equal:
   case ir_binop_nequal:
   case ir_binop_any_nequal:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
         temp.type = op[0].type;

      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
      emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1));
      break;

   case ir_binop_logic_xor:
      emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_or:
      emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_and:
      emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;

   case ir_binop_dot:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_unop_sqrt:
      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_i2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
      emit(BRW_OPCODE_MOV, this->result, op[0]);
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
         temp.type = op[0].type;

      inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
      break;

   case ir_unop_trunc:
      emit(BRW_OPCODE_RNDZ, this->result, op[0]);
      break;
   case ir_unop_ceil:
      op[0].negate = !op[0].negate;
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      break;
   case ir_unop_fract:
      inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
      break;
   case ir_unop_round_even:
      emit(BRW_OPCODE_RNDE, this->result, op[0]);
      break;

   case ir_binop_min:
      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
      inst->conditional_mod = BRW_CONDITIONAL_L;

      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
      inst->predicated = true;
      break;
   case ir_binop_max:
      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
      inst->conditional_mod = BRW_CONDITIONAL_G;

      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
      inst->predicated = true;
      break;

   case ir_binop_pow:
      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
      break;
   case ir_binop_bit_and:
      inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_xor:
      inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_or:
      inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_unop_u2f:
   case ir_binop_lshift:
   case ir_binop_rshift:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
}

void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
                                   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
         l.type = brw_type_for_base_type(type);
         r.type = brw_type_for_base_type(type);

         fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
         inst->predicated = predicated;

         l.reg_offset++;
         r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.structure[i].type,
                                predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
      break;

   default:
      assert(!"not reached");
      break;
   }
}

void
fs_visitor::visit(ir_assignment *ir)
{
   struct fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   ir->rhs->accept(this);
   r = this->result;

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
         if (ir->write_mask & (1 << i)) {
            inst = emit(BRW_OPCODE_MOV, l, r);
            if (ir->condition)
               inst->predicated = true;
            r.reg_offset++;
         }
         l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
         coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;

      if (ir->op == ir_tex) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
         mlen++;
      } else if (ir->op == ir_txb) {
         ir->lod_info.bias->accept(this);
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
         mlen++;
      } else {
         assert(ir->op == ir_txl);
         ir->lod_info.lod->accept(this);
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
         mlen++;
      }

      ir->shadow_comparitor->accept(this);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
         coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else if (ir->op == ir_txd) {
      assert(!"TXD isn't supported on gen4 yet.");
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      assert(ir->op == ir_txb || ir->op == ir_txl);

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), coordinate);
         coordinate.reg_offset++;
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      if (ir->op == ir_txb) {
         ir->lod_info.bias->accept(this);
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
         mlen++;
      } else {
         ir->lod_info.lod->accept(this);
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
         mlen++;
      }

      /* The unused upper half. */
      mlen++;

      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk.  We'll need to move
       * this weirdness around to the expected layout.
       */
      simd16 = true;
      orig_dst = dst;
      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
                                                       2));
      dst.type = BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(FS_OPCODE_TEX, dst);
      break;
   case ir_txb:
      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      inst = emit(FS_OPCODE_TXL, dst);
      break;
   case ir_txd:
      inst = emit(FS_OPCODE_TXD, dst);
      break;
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (simd16) {
      for (int i = 0; i < 4; i++) {
         emit(BRW_OPCODE_MOV, orig_dst, dst);
         orig_dst.reg_offset++;
         dst.reg_offset += 2;
      }
   }

   return inst;
}
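
/* Payload sketch for the gen4 SIMD16 bias/LOD fallback above (inferred from
 * the stride-2 MRF writes): each SIMD16 parameter spans two message
 * registers and only the low SIMD8 half is written, giving roughly
 * u,-, v,-, r,-, lod,-.  The response comes back as two interleaved vec4s,
 * and the four trailing MOVs copy the even-indexed (valid) ones into the
 * real destination.
 */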

fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   /* gen5's SIMD8 sampler has slots for u, v, r, array index, then
    * optional parameters like the shadow comparator or LOD bias.  When
    * the optional parameters aren't present, the trailing base slots
    * can be omitted from the message.
    *
    * We don't fill in the unnecessary slots regardless, which may
    * look surprising in the disassembly.
    */
   int mlen = 1; /* g0 header always present. */
   int base_mrf = 1;

   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
      coordinate.reg_offset++;
   }
   mlen += ir->coordinate->type->vector_elements;

   if (ir->shadow_comparitor) {
      mlen = MAX2(mlen, 5);

      ir->shadow_comparitor->accept(this);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(FS_OPCODE_TEX, dst);
      break;
   case ir_txb:
      ir->lod_info.bias->accept(this);
      mlen = MAX2(mlen, 5);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;

      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      ir->lod_info.lod->accept(this);
      mlen = MAX2(mlen, 5);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;

      inst = emit(FS_OPCODE_TXL, dst);
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   return inst;
}

void
fs_visitor::visit(ir_texture *ir)
{
   int sampler;
   fs_inst *inst = NULL;

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   if (ir->offset != NULL) {
      ir_constant *offset = ir->offset->as_constant();
      assert(offset != NULL);

      signed char offsets[3];
      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++)
         offsets[i] = (signed char) offset->value.i[i];

      /* Combine all three offsets into a single unsigned dword:
       *
       *    bits 11:8 - U Offset (X component)
       *    bits  7:4 - V Offset (Y component)
       *    bits  3:0 - R Offset (Z component)
       */
      unsigned offset_bits = 0;
      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) {
         const unsigned shift = 4 * (2 - i);
         offset_bits |= (offsets[i] << shift) & (0xF << shift);
      }

      /* Explicitly set up the message header by copying g0 to msg reg m1. */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD),
           fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD));

      /* Then set the offset bits in DWord 2 of the message header. */
      emit(BRW_OPCODE_MOV,
           fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2),
                         BRW_REGISTER_TYPE_UD)),
           fs_reg(brw_imm_uw(offset_bits)));
   }

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
                                             ctx->Shader.CurrentFragmentProgram,
                                             &brw->fragment_program->Base);
   sampler = c->fp->program.Base.SamplerUnits[sampler];

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
      int tokens[STATE_LENGTH] = {
         STATE_INTERNAL,
         STATE_TEXRECT_SCALE,
         sampler,
         0,
         0
      };

      c->prog_data.param_convert[c->prog_data.nr_params] =
         PARAM_NO_CONVERT;
      c->prog_data.param_convert[c->prog_data.nr_params + 1] =
         PARAM_NO_CONVERT;

      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
      GLuint index = _mesa_add_state_reference(params,
                                               (gl_state_index *)tokens);

      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 0;
      c->prog_data.nr_params++;
      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 1;
      c->prog_data.nr_params++;

      fs_reg dst = fs_reg(this, ir->coordinate->type);
      fs_reg src = coordinate;
      coordinate = dst;

      emit(BRW_OPCODE_MUL, dst, src, scale_x);
      dst.reg_offset++;
      src.reg_offset++;
      emit(BRW_OPCODE_MUL, dst, src, scale_y);
   }

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::vec4_type);

   if (intel->gen < 5) {
      inst = emit_texture_gen4(ir, dst, coordinate);
   } else {
      inst = emit_texture_gen5(ir, dst, coordinate);
   }

   /* If there's an offset, we already set up m1.  To avoid the implied move,
    * use the null register.  Otherwise, we want an implied move from g0.
    */
   if (ir->offset != NULL)
      inst->src[0] = fs_reg(brw_null_reg());
   else
      inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));

   inst->sampler = sampler;

   this->result = dst;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   if (ir->type == glsl_type::float_type) {
      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
      assert(ir->sampler->type->sampler_shadow);
   } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
         int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
         fs_reg l = swizzle_dst;
         l.reg_offset += i;

         if (swiz == SWIZZLE_ZERO) {
            emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
         } else if (swiz == SWIZZLE_ONE) {
            emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
         } else {
            fs_reg r = dst;
            r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
            emit(BRW_OPCODE_MOV, l, r);
         }
      }
      this->result = swizzle_dst;
   }
}

void
fs_visitor::visit(ir_swizzle *ir)
{
   ir->val->accept(this);
   fs_reg val = this->result;

   if (ir->type->vector_elements == 1) {
      this->result.reg_offset += ir->mask.x;
      return;
   }

   fs_reg result = fs_reg(this, ir->type);
   this->result = result;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      fs_reg channel = val;
      int swiz = 0;

      switch (i) {
      case 0:
         swiz = ir->mask.x;
         break;
      case 1:
         swiz = ir->mask.y;
         break;
      case 2:
         swiz = ir->mask.z;
         break;
      case 3:
         swiz = ir->mask.w;
         break;
      }

      channel.reg_offset += swiz;
      emit(BRW_OPCODE_MOV, result, channel);
      result.reg_offset++;
   }
}
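
/* Illustration: in this scalarized backend a swizzle like "v.zxyy" becomes
 * one MOV per result channel, reading val at reg_offset + 2, 0, 1, 1.  A
 * single-channel swizzle such as "v.z" is free -- it only bumps reg_offset.
 */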

void
fs_visitor::visit(ir_discard *ir)
{
   fs_reg temp = fs_reg(this, glsl_type::uint_type);

   assert(ir->condition == NULL); /* FINISHME */

   emit(FS_OPCODE_DISCARD_NOT, temp, reg_null_d);
   emit(FS_OPCODE_DISCARD_AND, reg_null_d, temp);
   kill_emitted = true;
}

void
fs_visitor::visit(ir_constant *ir)
{
   /* Set this->result to reg at the bottom of the function because some code
    * paths will cause this visitor to be applied to other fields.  This will
    * cause the value stored in this->result to be modified.
    *
    * Make reg constant so that it doesn't get accidentally modified along the
    * way.  Yes, I actually had this problem. :(
    */
   const fs_reg reg(this, ir->type);
   fs_reg dst_reg = reg;

   if (ir->type->is_array()) {
      const unsigned size = type_size(ir->type->fields.array);

      for (unsigned i = 0; i < ir->type->length; i++) {
         ir->array_elements[i]->accept(this);
         fs_reg src_reg = this->result;

         dst_reg.type = src_reg.type;
         for (unsigned j = 0; j < size; j++) {
            emit(BRW_OPCODE_MOV, dst_reg, src_reg);
            src_reg.reg_offset++;
            dst_reg.reg_offset++;
         }
      }
   } else if (ir->type->is_record()) {
      foreach_list(node, &ir->components) {
         ir_instruction *const field = (ir_instruction *) node;
         const unsigned size = type_size(field->type);

         field->accept(this);
         fs_reg src_reg = this->result;

         dst_reg.type = src_reg.type;
         for (unsigned j = 0; j < size; j++) {
            emit(BRW_OPCODE_MOV, dst_reg, src_reg);
            src_reg.reg_offset++;
            dst_reg.reg_offset++;
         }
      }
   } else {
      const unsigned size = type_size(ir->type);

      for (unsigned i = 0; i < size; i++) {
         switch (ir->type->base_type) {
         case GLSL_TYPE_FLOAT:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
            break;
         case GLSL_TYPE_UINT:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
            break;
         case GLSL_TYPE_INT:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
            break;
         case GLSL_TYPE_BOOL:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
            break;
         default:
            assert(!"Non-float/uint/int/bool constant");
         }
         dst_reg.reg_offset++;
      }
   }

   this->result = reg;
}

void
fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
{
   ir_expression *expr = ir->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         assert(expr->operands[i]->type->is_scalar());

         expr->operands[i]->accept(this);
         op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         break;

      case ir_binop_logic_xor:
         inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_or:
         inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_and:
         inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_unop_f2b:
         if (intel->gen >= 6) {
            inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
         } else {
            inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
         }
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_unop_i2b:
         if (intel->gen >= 6) {
            inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
         } else {
            inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
         }
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
         inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
         inst->conditional_mod =
            brw_conditional_for_comparison(expr->operation);
         break;

      default:
         assert(!"not reached");
         this->fail = true;
         break;
      }
      return;
   }

   ir->accept(this);

   if (intel->gen >= 6) {
      fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result);
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}

/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
fs_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;
      fs_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         assert(expr->operands[i]->type->is_scalar());

         expr->operands[i]->accept(this);
         op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         return;

      case ir_binop_logic_xor:
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_binop_logic_or:
         temp = fs_reg(this, glsl_type::bool_type);
         emit(BRW_OPCODE_OR, temp, op[0], op[1]);
         inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_binop_logic_and:
         temp = fs_reg(this, glsl_type::bool_type);
         emit(BRW_OPCODE_AND, temp, op[0], op[1]);
         inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_unop_f2b:
         inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_unop_i2b:
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
         inst->conditional_mod =
            brw_conditional_for_comparison(expr->operation);
         return;
      default:
         assert(!"not reached");
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         this->fail = true;
         return;
      }
      return;
   }

   ir->condition->accept(this);

   fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;
}

void
fs_visitor::visit(ir_if *ir)
{
   fs_inst *inst;

   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (intel->gen >= 6) {
      emit_if_gen6(ir);
   } else {
      emit_bool_to_cond_code(ir->condition);

      inst = emit(BRW_OPCODE_IF);
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();
      this->base_ir = ir;

      ir->accept(this);
   }

   if (!ir->else_instructions.is_empty()) {
      emit(BRW_OPCODE_ELSE);

      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
         ir_instruction *ir = (ir_instruction *)iter.get();
         this->base_ir = ir;

         ir->accept(this);
      }
   }

   emit(BRW_OPCODE_ENDIF);
}

void
fs_visitor::visit(ir_loop *ir)
{
   fs_reg counter = reg_undef;

   if (ir->counter) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
         this->base_ir = ir->from;
         ir->from->accept(this);

         emit(BRW_OPCODE_MOV, counter, this->result);
      }
   }

   emit(BRW_OPCODE_DO);

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);

      inst = emit(BRW_OPCODE_BREAK);
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();

      this->base_ir = ir;
      ir->accept(this);
   }

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(BRW_OPCODE_ADD, counter, counter, this->result);
   }

   emit(BRW_OPCODE_WHILE);
}

void
fs_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(BRW_OPCODE_BREAK);
      break;
   case ir_loop_jump::jump_continue:
      emit(BRW_OPCODE_CONTINUE);
      break;
   }
}

void
fs_visitor::visit(ir_call *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_return *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined before we get to ir_to_mesa.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(&empty);

      assert(sig);

      foreach_iter(exec_list_iterator, iter, sig->body) {
         ir_instruction *ir = (ir_instruction *)iter.get();
         this->base_ir = ir;

         ir->accept(this);
      }
   }
}

void
fs_visitor::visit(ir_function_signature *ir)
{
   assert(!"not reached");
   (void)ir;
}

fs_inst *
fs_visitor::emit(fs_inst inst)
{
   fs_inst *list_inst = new(mem_ctx) fs_inst;
   *list_inst = inst;

   list_inst->annotation = this->current_annotation;
   list_inst->ir = this->base_ir;

   this->instructions.push_tail(list_inst);

   return list_inst;
}

/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
void
fs_visitor::emit_dummy_fs()
{
   /* Everyone's favorite color. */
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f));
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f));
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f));
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f));

   fs_inst *write;
   write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
   write->base_mrf = 0;
}

/* The register location here is relative to the start of the URB
 * data.  It will get adjusted to be a real location before
 * generate_code() time.
 */
struct brw_reg
fs_visitor::interp_reg(int location, int channel)
{
   int regnr = urb_setup[location] * 2 + channel / 2;
   int stride = (channel & 1) * 4;

   assert(urb_setup[location] != -1);

   return brw_vec1_grf(regnr, stride);
}
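
/* Mapping example (derived from the arithmetic above): each URB slot
 * occupies two setup registers, with channels x/y in the first and z/w in
 * the second at suboffsets 0 and 4.  So with urb_setup[location] == 3,
 * channel 2 (z) yields brw_vec1_grf(7, 0).
 */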

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen4()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   this->current_annotation = "compute pixel centers";
   this->pixel_x = fs_reg(this, glsl_type::uint_type);
   this->pixel_y = fs_reg(this, glsl_type::uint_type);
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
   emit(BRW_OPCODE_ADD,
        this->pixel_x,
        fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
        fs_reg(brw_imm_v(0x10101010)));
   emit(BRW_OPCODE_ADD,
        this->pixel_y,
        fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
        fs_reg(brw_imm_v(0x11001100)));

   this->current_annotation = "compute pixel deltas from v0";
   if (brw->has_pln) {
      this->delta_x = fs_reg(this, glsl_type::vec2_type);
      this->delta_y = this->delta_x;
      this->delta_y.reg_offset++;
   } else {
      this->delta_x = fs_reg(this, glsl_type::float_type);
      this->delta_y = fs_reg(this, glsl_type::float_type);
   }
   emit(BRW_OPCODE_ADD, this->delta_x,
        this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
   emit(BRW_OPCODE_ADD, this->delta_y,
        this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));

   this->current_annotation = "compute pos.w and 1/pos.w";
   /* Compute wpos.w.  It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = fs_reg(this, glsl_type::float_type);
   emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
        interp_reg(FRAG_ATTRIB_WPOS, 3));
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
   this->current_annotation = NULL;
}
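
/* Note on the brw_imm_v values above (informal): 0x10101010 and 0x11001100
 * are packed 4-bit vector immediates whose nibbles expand per channel to
 * 0,1,0,1,... and 0,0,1,1,..., turning the per-subspan X/Y in the payload
 * into the pixel coordinates of each 2x2 quad.
 */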
1971
1972 /** Emits the interpolation for the varying inputs. */
1973 void
1974 fs_visitor::emit_interpolation_setup_gen6()
1975 {
1976 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1977
1978 /* If the pixel centers end up used, the setup is the same as for gen4. */
1979 this->current_annotation = "compute pixel centers";
1980 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
1981 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
1982 int_pixel_x.type = BRW_REGISTER_TYPE_UW;
1983 int_pixel_y.type = BRW_REGISTER_TYPE_UW;
1984 emit(BRW_OPCODE_ADD,
1985 int_pixel_x,
1986 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1987 fs_reg(brw_imm_v(0x10101010)));
1988 emit(BRW_OPCODE_ADD,
1989 int_pixel_y,
1990 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1991 fs_reg(brw_imm_v(0x11001100)));
1992
1993 /* As of gen6, we can no longer mix float and int sources. We have
1994 * to turn the integer pixel centers into floats for their actual
1995 * use.
1996 */
1997 this->pixel_x = fs_reg(this, glsl_type::float_type);
1998 this->pixel_y = fs_reg(this, glsl_type::float_type);
1999 emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
2000 emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
2001
2002 this->current_annotation = "compute 1/pos.w";
2003 this->wpos_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
2004 this->pixel_w = fs_reg(this, glsl_type::float_type);
2005 emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
2006
2007 this->delta_x = fs_reg(brw_vec8_grf(2, 0));
2008 this->delta_y = fs_reg(brw_vec8_grf(3, 0));
2009
2010 this->current_annotation = NULL;
2011 }
2012
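/**
 * Builds the FB write message payload in MRFs -- optional header,
 * AA dest stencil, per-target color, and source/dest depth -- and
 * emits one FS_OPCODE_FB_WRITE per color region, with EOT on the last.
 */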
2013 void
2014 fs_visitor::emit_fb_writes()
2015 {
2016 this->current_annotation = "FB write header";
2017 bool header_present = true;
2018 int nr = 0;
2019
2020 if (intel->gen >= 6 &&
2021 !this->kill_emitted &&
2022 c->key.nr_color_regions == 1) {
2023 header_present = false;
2024 }
2025
2026 if (header_present) {
2027 /* m0, m1 header */
2028 nr += 2;
2029 }
2030
2031 if (c->aa_dest_stencil_reg) {
2032 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2033 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
2034 }
2035
2036 /* Reserve space for color. It'll be filled in per MRT below. */
2037 int color_mrf = nr;
2038 nr += 4;
2039
2040 if (c->source_depth_to_render_target) {
2041 if (c->computes_depth) {
2042 /* Hand over gl_FragDepth. */
2043 assert(this->frag_depth);
2044 fs_reg depth = *(variable_storage(this->frag_depth));
2045
2046 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth);
2047 } else {
2048 /* Pass through the payload depth. */
2049 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2050 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
2051 }
2052 }
2053
2054 if (c->dest_depth_reg) {
2055 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2056 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
2057 }
2058
2059 fs_reg color = reg_undef;
2060 if (this->frag_color)
2061 color = *(variable_storage(this->frag_color));
2062 else if (this->frag_data) {
2063 color = *(variable_storage(this->frag_data));
2064 color.type = BRW_REGISTER_TYPE_F;
2065 }
2066
2067 for (int target = 0; target < c->key.nr_color_regions; target++) {
2068 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2069 "FB write target %d",
2070 target);
2071 if (this->frag_color || this->frag_data) {
2072 for (int i = 0; i < 4; i++) {
2073 emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i), color);
2074 color.reg_offset++;
2075 }
2076 }
2077
2078 if (this->frag_color)
2079 color.reg_offset -= 4;
2080
2081 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2082 inst->target = target;
2083 inst->base_mrf = 0;
2084 inst->mlen = nr;
2085 if (target == c->key.nr_color_regions - 1)
2086 inst->eot = true;
2087 inst->header_present = header_present;
2088 }
2089
2090 if (c->key.nr_color_regions == 0) {
2091 if (c->key.alpha_test && (this->frag_color || this->frag_data)) {
2092 /* If the alpha test is enabled but there's no color buffer,
2093 * we still need to send alpha out the pipeline to our null
2094 * renderbuffer.
2095 */
2096 color.reg_offset += 3;
2097 emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + 3), color);
2098 }
2099
2100 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2101 inst->base_mrf = 0;
2102 inst->mlen = nr;
2103 inst->eot = true;
2104 inst->header_present = header_present;
2105 }
2106
2107 this->current_annotation = NULL;
2108 }
2109
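/**
 * Generates the native code for an FB write: builds the two-register
 * message header (copied explicitly on gen6, passed as the implied
 * header on older parts) and emits the actual send.
 */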
2110 void
2111 fs_visitor::generate_fb_write(fs_inst *inst)
2112 {
2113 GLboolean eot = inst->eot;
2114 struct brw_reg implied_header;
2115
2116 /* The header is 2 regs, with g0 and g1 as the contents. g0 is
2117 * handled by the implied move; here we set up g1.
2118 */
2119 brw_push_insn_state(p);
2120 brw_set_mask_control(p, BRW_MASK_DISABLE);
2121 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2122
2123 if (inst->header_present) {
2124 if (intel->gen >= 6) {
2125 brw_MOV(p,
2126 brw_message_reg(inst->base_mrf),
2127 brw_vec8_grf(0, 0));
2128
2129 if (inst->target > 0) {
2130 /* Set the render target index for choosing BLEND_STATE. */
2131 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
2132 BRW_REGISTER_TYPE_UD),
2133 brw_imm_ud(inst->target));
2134 }
2135
2136 /* Clear viewport index, render target array index. */
2137 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0),
2138 BRW_REGISTER_TYPE_UD),
2139 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2140 brw_imm_ud(0xf7ff));
2141
2142 implied_header = brw_null_reg();
2143 } else {
2144 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
2145 }
2146
2147 brw_MOV(p,
2148 brw_message_reg(inst->base_mrf + 1),
2149 brw_vec8_grf(1, 0));
2150 } else {
2151 implied_header = brw_null_reg();
2152 }
2153
2154 brw_pop_insn_state(p);
2155
2156 brw_fb_WRITE(p,
2157 8, /* dispatch_width */
2158 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
2159 inst->base_mrf,
2160 implied_header,
2161 inst->target,
2162 inst->mlen,
2163 0,
2164 eot,
2165 inst->header_present);
2166 }
2167
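/* Emits a single PLN when the hardware supports it and delta_x/delta_y
 * live in a suitably aligned register pair; otherwise falls back to the
 * equivalent LINE+MAC sequence.
 */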
2168 void
2169 fs_visitor::generate_linterp(fs_inst *inst,
2170 struct brw_reg dst, struct brw_reg *src)
2171 {
2172 struct brw_reg delta_x = src[0];
2173 struct brw_reg delta_y = src[1];
2174 struct brw_reg interp = src[2];
2175
2176 if (brw->has_pln &&
2177 delta_y.nr == delta_x.nr + 1 &&
2178 (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
2179 brw_PLN(p, dst, interp, delta_x);
2180 } else {
2181 brw_LINE(p, brw_null_reg(), interp, delta_x);
2182 brw_MAC(p, dst, suboffset(interp, 1), delta_y);
2183 }
2184 }
2185
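/* Translates an FS math opcode into the hardware math instruction.
 * On gen6+ math operates directly on registers (mlen == 0); on older
 * parts it's message-based, sourcing from base_mrf (mlen >= 1).
 */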
2186 void
2187 fs_visitor::generate_math(fs_inst *inst,
2188 struct brw_reg dst, struct brw_reg *src)
2189 {
2190 int op;
2191
2192 switch (inst->opcode) {
2193 case FS_OPCODE_RCP:
2194 op = BRW_MATH_FUNCTION_INV;
2195 break;
2196 case FS_OPCODE_RSQ:
2197 op = BRW_MATH_FUNCTION_RSQ;
2198 break;
2199 case FS_OPCODE_SQRT:
2200 op = BRW_MATH_FUNCTION_SQRT;
2201 break;
2202 case FS_OPCODE_EXP2:
2203 op = BRW_MATH_FUNCTION_EXP;
2204 break;
2205 case FS_OPCODE_LOG2:
2206 op = BRW_MATH_FUNCTION_LOG;
2207 break;
2208 case FS_OPCODE_POW:
2209 op = BRW_MATH_FUNCTION_POW;
2210 break;
2211 case FS_OPCODE_SIN:
2212 op = BRW_MATH_FUNCTION_SIN;
2213 break;
2214 case FS_OPCODE_COS:
2215 op = BRW_MATH_FUNCTION_COS;
2216 break;
2217 default:
2218 assert(!"not reached: unknown math function");
2219 op = 0;
2220 break;
2221 }
2222
2223 if (intel->gen >= 6) {
2224 assert(inst->mlen == 0);
2225
2226 if (inst->opcode == FS_OPCODE_POW) {
2227 brw_math2(p, dst, op, src[0], src[1]);
2228 } else {
2229 brw_math(p, dst,
2230 op,
2231 inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2232 BRW_MATH_SATURATE_NONE,
2233 0, src[0],
2234 BRW_MATH_DATA_VECTOR,
2235 BRW_MATH_PRECISION_FULL);
2236 }
2237 } else {
2238 assert(inst->mlen >= 1);
2239
2240 brw_math(p, dst,
2241 op,
2242 inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2243 BRW_MATH_SATURATE_NONE,
2244 inst->base_mrf, src[0],
2245 BRW_MATH_DATA_VECTOR,
2246 BRW_MATH_PRECISION_FULL);
2247 }
2248 }
2249
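/* Selects the sampler message type for a texture instruction -- gen5+
 * has distinct message types per opcode, while gen4 mostly encodes
 * shadow compare and SIMD width in the message length -- and emits the
 * SAMPLE send.
 */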
2250 void
2251 fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2252 {
2253 int msg_type = -1;
2254 int rlen = 4;
2255 uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
2256
2257 if (intel->gen >= 5) {
2258 switch (inst->opcode) {
2259 case FS_OPCODE_TEX:
2260 if (inst->shadow_compare) {
2261 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
2262 } else {
2263 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
2264 }
2265 break;
2266 case FS_OPCODE_TXB:
2267 if (inst->shadow_compare) {
2268 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
2269 } else {
2270 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
2271 }
2272 break;
2273 case FS_OPCODE_TXL:
2274 if (inst->shadow_compare) {
2275 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
2276 } else {
2277 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
2278 }
2279 break;
2280 case FS_OPCODE_TXD:
2281 assert(!"TXD isn't supported on gen5+ yet.");
2282 break;
2283 }
2284 } else {
2285 switch (inst->opcode) {
2286 case FS_OPCODE_TEX:
2287 /* Note that G45 and older determine shadow compare and dispatch width
2288 * from the message length for most messages.
2289 */
2290 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2291 if (inst->shadow_compare) {
2292 assert(inst->mlen == 6);
2293 } else {
2294 assert(inst->mlen <= 4);
2295 }
2296 break;
2297 case FS_OPCODE_TXB:
2298 if (inst->shadow_compare) {
2299 assert(inst->mlen == 6);
2300 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
2301 } else {
2302 assert(inst->mlen == 9);
2303 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2304 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2305 }
2306 break;
2307 case FS_OPCODE_TXL:
2308 if (inst->shadow_compare) {
2309 assert(inst->mlen == 6);
2310 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
2311 } else {
2312 assert(inst->mlen == 9);
2313 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
2314 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2315 }
2316 break;
2317 case FS_OPCODE_TXD:
2318 assert(!"TXD isn't supported on gen4 yet.");
2319 break;
2320 }
2321 }
2322 assert(msg_type != -1);
2323
2324 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
2325 rlen = 8;
2326 dst = vec16(dst);
2327 }
2328
2329 brw_SAMPLE(p,
2330 retype(dst, BRW_REGISTER_TYPE_UW),
2331 inst->base_mrf,
2332 src,
2333 SURF_INDEX_TEXTURE(inst->sampler),
2334 inst->sampler,
2335 WRITEMASK_XYZW,
2336 msg_type,
2337 rlen,
2338 inst->mlen,
2339 0,
2340 1,
2341 simd_mode);
2342 }
2343
2344
2345 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2346 * looking like:
2347 *
2348 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2349 *
2350 * and we're trying to produce:
2351 *
2352 * DDX DDY
2353 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
2354 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
2355 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
2356 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
2357 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
2358 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
2359 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
2360 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
2361 *
2362 * and add another set of two more subspans if in 16-pixel dispatch mode.
2363 *
2364 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2365 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2366 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2367 * between each other. We could probably do it like ddx and swizzle the right
2368 * order later, but bail for now and just produce
2369 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
2370 */
2371 void
2372 fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2373 {
2374 struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2375 BRW_REGISTER_TYPE_F,
2376 BRW_VERTICAL_STRIDE_2,
2377 BRW_WIDTH_2,
2378 BRW_HORIZONTAL_STRIDE_0,
2379 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2380 struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2381 BRW_REGISTER_TYPE_F,
2382 BRW_VERTICAL_STRIDE_2,
2383 BRW_WIDTH_2,
2384 BRW_HORIZONTAL_STRIDE_0,
2385 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2386 brw_ADD(p, dst, src0, negate(src1));
2387 }
2388
2389 void
2390 fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2391 {
2392 struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2393 BRW_REGISTER_TYPE_F,
2394 BRW_VERTICAL_STRIDE_4,
2395 BRW_WIDTH_4,
2396 BRW_HORIZONTAL_STRIDE_0,
2397 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2398 struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2399 BRW_REGISTER_TYPE_F,
2400 BRW_VERTICAL_STRIDE_4,
2401 BRW_WIDTH_4,
2402 BRW_HORIZONTAL_STRIDE_0,
2403 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2404 brw_ADD(p, dst, src0, negate(src1));
2405 }
2406
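/* First half of discard handling: produces the mask of channels to
 * keep. See generate_discard_and() for how it gets applied.
 */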
2407 void
2408 fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
2409 {
2410 if (intel->gen >= 6) {
2411 /* Gen6 no longer has the mask reg for us to just read the
2412 * active channels from. However, cmp updates just the channels
2413 * of the flag reg that are enabled, so we can get at the
2414 * channel enables that way. In this step, make a reg of ones
2415 * we'll compare to.
2416 */
2417 brw_MOV(p, mask, brw_imm_ud(1));
2418 } else {
2419 brw_push_insn_state(p);
2420 brw_set_mask_control(p, BRW_MASK_DISABLE);
2421 brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
2422 brw_pop_insn_state(p);
2423 }
2424 }
2425
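/* Second half of discard handling: ANDs the kill mask into the payload
 * header mask (g0 on pre-gen6, g1.7 via the flag-register trick on
 * gen6) so discarded channels drop out of the eventual FB write.
 */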
2426 void
2427 fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
2428 {
2429 if (intel->gen >= 6) {
2430 struct brw_reg f0 = brw_flag_reg();
2431 struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
2432
2433 brw_push_insn_state(p);
2434 brw_set_mask_control(p, BRW_MASK_DISABLE);
2435 brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */
2436 brw_pop_insn_state(p);
2437
2438 brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
2439 BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */
2440 /* Undo CMP's whacking of predication. */
2441 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2442
2443 brw_push_insn_state(p);
2444 brw_set_mask_control(p, BRW_MASK_DISABLE);
2445 brw_AND(p, g1, f0, g1);
2446 brw_pop_insn_state(p);
2447 } else {
2448 struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
2449
2450 mask = brw_uw1_reg(mask.file, mask.nr, 0);
2451
2452 brw_push_insn_state(p);
2453 brw_set_mask_control(p, BRW_MASK_DISABLE);
2454 brw_AND(p, g0, mask, g0);
2455 brw_pop_insn_state(p);
2456 }
2457 }
2458
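/* Writes one register of a spilled virtual GRF out to its scratch
 * slot at inst->offset.
 */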
2459 void
2460 fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
2461 {
2462 assert(inst->mlen != 0);
2463
2464 brw_MOV(p,
2465 retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
2466 retype(src, BRW_REGISTER_TYPE_UD));
2467 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
2468 inst->offset);
2469 }
2470
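/* Reads a spilled register back from scratch, working around the gen4
 * send-destination errata described below.
 */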
2471 void
2472 fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
2473 {
2474 assert(inst->mlen != 0);
2475
2476 /* Clear any post destination dependencies that would be ignored by
2477 * the block read. See the B-Spec for pre-gen5 send instruction.
2478 *
2479 * This could use a better solution, since texture sampling and
2480 * math reads could potentially run into it as well -- anywhere
2481 * that we have a SEND with a destination that is a register that
2482 * was written but not read within the last N instructions (what's
2483 * N? unsure). This is rare because of dead code elimination, but
2484 * not impossible.
2485 */
2486 if (intel->gen == 4 && !intel->is_g4x)
2487 brw_MOV(p, brw_null_reg(), dst);
2488
2489 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
2490 inst->offset);
2491
2492 if (intel->gen == 4 && !intel->is_g4x) {
2493 /* gen4 errata: destination from a send can't be used as a
2494 * destination until it's been read. Just read it so we don't
2495 * have to worry.
2496 */
2497 brw_MOV(p, brw_null_reg(), dst);
2498 }
2499 }
2500
2501
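/* Emits the oword block read that loads a pulled uniform from the
 * constant buffer surface.
 */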
2502 void
2503 fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
2504 {
2505 assert(inst->mlen != 0);
2506
2507 /* Clear any post destination dependencies that would be ignored by
2508 * the block read. See the B-Spec for pre-gen5 send instruction.
2509 *
2510 * This could use a better solution, since texture sampling and
2511 * math reads could potentially run into it as well -- anywhere
2512 * that we have a SEND with a destination that is a register that
2513 * was written but not read within the last N instructions (what's
2514 * N? unsure). This is rare because of dead code elimination, but
2515 * not impossible.
2516 */
2517 if (intel->gen == 4 && !intel->is_g4x)
2518 brw_MOV(p, brw_null_reg(), dst);
2519
2520 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
2521 inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
2522
2523 if (intel->gen == 4 && !intel->is_g4x) {
2524 /* gen4 errata: destination from a send can't be used as a
2525 * destination until it's been read. Just read it so we don't
2526 * have to worry.
2527 */
2528 brw_MOV(p, brw_null_reg(), dst);
2529 }
2530 }
2531
2532 /**
2533 * To be called after the last _mesa_add_state_reference() call, to
2534 * set up prog_data.param[] for assign_curb_setup() and
2535 * setup_pull_constants().
2536 */
2537 void
2538 fs_visitor::setup_paramvalues_refs()
2539 {
2540 /* Set up the pointers to ParamValues now that the array is finalized. */
2541 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
2542 c->prog_data.param[i] =
2543 fp->Base.Parameters->ParameterValues[this->param_index[i]] +
2544 this->param_offset[i];
2545 }
2546 }
2547
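/**
 * Maps UNIFORM-file accesses to the fixed GRFs where the CURBE (push
 * constant) payload lands, at eight float constants per register.
 */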
2548 void
2549 fs_visitor::assign_curb_setup()
2550 {
2551 c->prog_data.first_curbe_grf = c->nr_payload_regs;
2552 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2553
2554 /* Map the offsets in the UNIFORM file to fixed HW regs. */
2555 foreach_iter(exec_list_iterator, iter, this->instructions) {
2556 fs_inst *inst = (fs_inst *)iter.get();
2557
2558 for (unsigned int i = 0; i < 3; i++) {
2559 if (inst->src[i].file == UNIFORM) {
2560 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2561 struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
2562 constant_nr / 8,
2563 constant_nr % 8);
2564
2565 inst->src[i].file = FIXED_HW_REG;
2566 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2567 }
2568 }
2569 }
2570 }
2571
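/**
 * Decides which URB slot each used varying input lands in. Gen6 packs
 * inputs in InputsRead order; older parts have to follow the SF unit's
 * mapping of VS outputs instead.
 */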
2572 void
2573 fs_visitor::calculate_urb_setup()
2574 {
2575 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2576 urb_setup[i] = -1;
2577 }
2578
2579 int urb_next = 0;
2580 /* Figure out where each of the incoming setup attributes lands. */
2581 if (intel->gen >= 6) {
2582 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2583 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2584 urb_setup[i] = urb_next++;
2585 }
2586 }
2587 } else {
2588 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2589 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2590 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2591 int fp_index;
2592
2593 if (i >= VERT_RESULT_VAR0)
2594 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2595 else if (i <= VERT_RESULT_TEX7)
2596 fp_index = i;
2597 else
2598 fp_index = -1;
2599
2600 if (fp_index >= 0)
2601 urb_setup[fp_index] = urb_next++;
2602 }
2603 }
2604 }
2605
2606 /* Each attribute is 4 setup channels, each of which is half a reg. */
2607 c->prog_data.urb_read_length = urb_next * 2;
2608 }
2609
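/**
 * Rebases LINTERP/CINTERP setup sources onto the GRFs where the URB
 * data actually starts, now that the CURBE size is known.
 */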
2610 void
2611 fs_visitor::assign_urb_setup()
2612 {
2613 int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
2614
2615 /* Offset all the urb_setup[] indices by the actual position of the
2616 * setup regs, now that the location of the constants has been chosen.
2617 */
2618 foreach_iter(exec_list_iterator, iter, this->instructions) {
2619 fs_inst *inst = (fs_inst *)iter.get();
2620
2621 if (inst->opcode == FS_OPCODE_LINTERP) {
2622 assert(inst->src[2].file == FIXED_HW_REG);
2623 inst->src[2].fixed_hw_reg.nr += urb_start;
2624 }
2625
2626 if (inst->opcode == FS_OPCODE_CINTERP) {
2627 assert(inst->src[0].file == FIXED_HW_REG);
2628 inst->src[0].fixed_hw_reg.nr += urb_start;
2629 }
2630 }
2631
2632 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2633 }
2634
2635 /**
2636 * Split large virtual GRFs into separate components if we can.
2637 *
2638 * This is mostly duplicated with what brw_fs_vector_splitting does,
2639 * but that's really conservative because it's afraid of doing
2640 * splitting that doesn't result in real progress after the rest of
2641 * the optimization phases, which would cause infinite looping in
2642 * optimization. We can do it once here, safely. This also has the
2643 * opportunity to split interpolated values, or maybe even uniforms,
2644 * which we don't have at the IR level.
2645 *
2646 * We want to split, because virtual GRFs are what we register
2647 * allocate and spill (due to contiguousness requirements for some
2648 * instructions), and they're what we naturally generate in the
2649 * codegen process, but most virtual GRFs don't actually need to be
2650 * contiguous sets of GRFs. If we split, we'll end up with reduced
2651 * live intervals and better dead code elimination and coalescing.
2652 */
2653 void
2654 fs_visitor::split_virtual_grfs()
2655 {
2656 int num_vars = this->virtual_grf_next;
2657 bool split_grf[num_vars];
2658 int new_virtual_grf[num_vars];
2659
2660 /* Try to split anything larger than one register. */
2661 for (int i = 0; i < num_vars; i++) {
2662 if (this->virtual_grf_sizes[i] != 1)
2663 split_grf[i] = true;
2664 else
2665 split_grf[i] = false;
2666 }
2667
2668 if (brw->has_pln) {
2669 /* PLN opcodes rely on the delta_xy being contiguous. */
2670 split_grf[this->delta_x.reg] = false;
2671 }
2672
2673 foreach_iter(exec_list_iterator, iter, this->instructions) {
2674 fs_inst *inst = (fs_inst *)iter.get();
2675
2676 /* Texturing produces 4 contiguous registers, so no splitting. */
2677 if (inst->is_tex()) {
2678 split_grf[inst->dst.reg] = false;
2679 }
2680 }
2681
2682 /* Allocate new space for split regs. Note that the virtual
2683 * numbers will be contiguous.
2684 */
2685 for (int i = 0; i < num_vars; i++) {
2686 if (split_grf[i]) {
2687 new_virtual_grf[i] = virtual_grf_alloc(1);
2688 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2689 int reg = virtual_grf_alloc(1);
2690 assert(reg == new_virtual_grf[i] + j - 1);
2691 (void) reg;
2692 }
2693 this->virtual_grf_sizes[i] = 1;
2694 }
2695 }
2696
2697 foreach_iter(exec_list_iterator, iter, this->instructions) {
2698 fs_inst *inst = (fs_inst *)iter.get();
2699
2700 if (inst->dst.file == GRF &&
2701 split_grf[inst->dst.reg] &&
2702 inst->dst.reg_offset != 0) {
2703 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2704 inst->dst.reg_offset - 1);
2705 inst->dst.reg_offset = 0;
2706 }
2707 for (int i = 0; i < 3; i++) {
2708 if (inst->src[i].file == GRF &&
2709 split_grf[inst->src[i].reg] &&
2710 inst->src[i].reg_offset != 0) {
2711 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2712 inst->src[i].reg_offset - 1);
2713 inst->src[i].reg_offset = 0;
2714 }
2715 }
2716 }
2717 this->live_intervals_valid = false;
2718 }
2719
2720 /**
2721 * Choose accesses from the UNIFORM file to demote to using the pull
2722 * constant buffer.
2723 *
2724 * We allow a fragment shader to have more than the GL-specified
2725 * minimum maximum number of fragment shader uniform components (64).
2726 * If there are too many, they'd fill up all of the register space.
2727 * So this pushes some of them out to the pull constant buffer and
2728 * updates the program to load them from there.
2729 */
2730 void
2731 fs_visitor::setup_pull_constants()
2732 {
2733 /* Only allow 16 registers (128 uniform components) as push constants. */
2734 unsigned int max_uniform_components = 16 * 8;
2735 if (c->prog_data.nr_params <= max_uniform_components)
2736 return;
2737
2738 /* Just demote the end of the list. We could probably do better
2739 * here, demoting things that are rarely used in the program first.
2740 */
2741 int pull_uniform_base = max_uniform_components;
2742 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2743
2744 foreach_iter(exec_list_iterator, iter, this->instructions) {
2745 fs_inst *inst = (fs_inst *)iter.get();
2746
2747 for (int i = 0; i < 3; i++) {
2748 if (inst->src[i].file != UNIFORM)
2749 continue;
2750
2751 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2752 if (uniform_nr < pull_uniform_base)
2753 continue;
2754
2755 fs_reg dst = fs_reg(this, glsl_type::float_type);
2756 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2757 dst);
2758 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2759 pull->ir = inst->ir;
2760 pull->annotation = inst->annotation;
2761 pull->base_mrf = 14;
2762 pull->mlen = 1;
2763
2764 inst->insert_before(pull);
2765
2766 inst->src[i].file = GRF;
2767 inst->src[i].reg = dst.reg;
2768 inst->src[i].reg_offset = 0;
2769 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
2770 }
2771 }
2772
2773 for (int i = 0; i < pull_uniform_count; i++) {
2774 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2775 c->prog_data.pull_param_convert[i] =
2776 c->prog_data.param_convert[pull_uniform_base + i];
2777 }
2778 c->prog_data.nr_params -= pull_uniform_count;
2779 c->prog_data.nr_pull_params = pull_uniform_count;
2780 }
2781
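/**
 * Computes a conservative [def, use] interval for each virtual GRF,
 * widening intervals to cover whole loops except where a
 * single-register def clearly dominates its uses.
 */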
2782 void
2783 fs_visitor::calculate_live_intervals()
2784 {
2785 int num_vars = this->virtual_grf_next;
2786 int *def = ralloc_array(mem_ctx, int, num_vars);
2787 int *use = ralloc_array(mem_ctx, int, num_vars);
2788 int loop_depth = 0;
2789 int loop_start = 0;
2790 int bb_header_ip = 0;
2791
2792 if (this->live_intervals_valid)
2793 return;
2794
2795 for (int i = 0; i < num_vars; i++) {
2796 def[i] = MAX_INSTRUCTION;
2797 use[i] = -1;
2798 }
2799
2800 int ip = 0;
2801 foreach_iter(exec_list_iterator, iter, this->instructions) {
2802 fs_inst *inst = (fs_inst *)iter.get();
2803
2804 if (inst->opcode == BRW_OPCODE_DO) {
2805 if (loop_depth++ == 0)
2806 loop_start = ip;
2807 } else if (inst->opcode == BRW_OPCODE_WHILE) {
2808 loop_depth--;
2809
2810 if (loop_depth == 0) {
2811 /* Patch up the uses of vars marked as live across
2812 * the whole loop.
2813 */
2814 for (int i = 0; i < num_vars; i++) {
2815 if (use[i] == loop_start) {
2816 use[i] = ip;
2817 }
2818 }
2819 }
2820 } else {
2821 for (unsigned int i = 0; i < 3; i++) {
2822 if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2823 int reg = inst->src[i].reg;
2824
2825 if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2826 def[reg] >= bb_header_ip)) {
2827 use[reg] = ip;
2828 } else {
2829 def[reg] = MIN2(loop_start, def[reg]);
2830 use[reg] = loop_start;
2831
2832 /* Nobody else is going to smash our start to a
2833 * later point in the loop now, because def[reg]
2834 * now points before the bb header.
2835 */
2836 }
2837 }
2838 }
2839 if (inst->dst.file == GRF && inst->dst.reg != 0) {
2840 int reg = inst->dst.reg;
2841
2842 if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2843 !inst->predicated)) {
2844 def[reg] = MIN2(def[reg], ip);
2845 } else {
2846 def[reg] = MIN2(def[reg], loop_start);
2847 }
2848 }
2849 }
2850
2851 ip++;
2852
2853 /* Set the basic block header IP. This is used for determining
2854 * if a complete def of a single-register virtual GRF in a loop
2855 * dominates a use in the same basic block. It's a quick way to
2856 * reduce the live interval range of most registers used in a
2857 * loop.
2858 */
2859 if (inst->opcode == BRW_OPCODE_IF ||
2860 inst->opcode == BRW_OPCODE_ELSE ||
2861 inst->opcode == BRW_OPCODE_ENDIF ||
2862 inst->opcode == BRW_OPCODE_DO ||
2863 inst->opcode == BRW_OPCODE_WHILE ||
2864 inst->opcode == BRW_OPCODE_BREAK ||
2865 inst->opcode == BRW_OPCODE_CONTINUE) {
2866 bb_header_ip = ip;
2867 }
2868 }
2869
2870 ralloc_free(this->virtual_grf_def);
2871 ralloc_free(this->virtual_grf_use);
2872 this->virtual_grf_def = def;
2873 this->virtual_grf_use = use;
2874
2875 this->live_intervals_valid = true;
2876 }
2877
2878 /**
2879 * Attempts to move immediate constants into the immediate
2880 * constant slot of following instructions.
2881 *
2882 * Immediate constants are a bit tricky -- they have to be in the last
2883 * operand slot, and you can't do abs/negate on them.
2884 */
2885
2886 bool
2887 fs_visitor::propagate_constants()
2888 {
2889 bool progress = false;
2890
2891 calculate_live_intervals();
2892
2893 foreach_iter(exec_list_iterator, iter, this->instructions) {
2894 fs_inst *inst = (fs_inst *)iter.get();
2895
2896 if (inst->opcode != BRW_OPCODE_MOV ||
2897 inst->predicated ||
2898 inst->dst.file != GRF || inst->src[0].file != IMM ||
2899 inst->dst.type != inst->src[0].type)
2900 continue;
2901
2902 /* Don't bother with cases where we should have had the
2903 * operation on the constant folded in GLSL already.
2904 */
2905 if (inst->saturate)
2906 continue;
2907
2908 /* Found a move of a constant to a GRF. Find anything else using the GRF
2909 * before it's written, and replace it with the constant if we can.
2910 */
2911 exec_list_iterator scan_iter = iter;
2912 scan_iter.next();
2913 for (; scan_iter.has_next(); scan_iter.next()) {
2914 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2915
2916 if (scan_inst->opcode == BRW_OPCODE_DO ||
2917 scan_inst->opcode == BRW_OPCODE_WHILE ||
2918 scan_inst->opcode == BRW_OPCODE_ELSE ||
2919 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2920 break;
2921 }
2922
2923 for (int i = 2; i >= 0; i--) {
2924 if (scan_inst->src[i].file != GRF ||
2925 scan_inst->src[i].reg != inst->dst.reg ||
2926 scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2927 continue;
2928
2929 /* Don't bother with cases where we should have had the
2930 * operation on the constant folded in GLSL already.
2931 */
2932 if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2933 continue;
2934
2935 switch (scan_inst->opcode) {
2936 case BRW_OPCODE_MOV:
2937 scan_inst->src[i] = inst->src[0];
2938 progress = true;
2939 break;
2940
2941 case BRW_OPCODE_MUL:
2942 case BRW_OPCODE_ADD:
2943 if (i == 1) {
2944 scan_inst->src[i] = inst->src[0];
2945 progress = true;
2946 } else if (i == 0 && scan_inst->src[1].file != IMM) {
2947 /* Fit this constant in by commuting the operands */
2948 scan_inst->src[0] = scan_inst->src[1];
2949 scan_inst->src[1] = inst->src[0];
2950 progress = true;
2951 }
2952 break;
2953 case BRW_OPCODE_CMP:
2954 case BRW_OPCODE_SEL:
2955 if (i == 1) {
2956 scan_inst->src[i] = inst->src[0];
2957 progress = true;
2958 }
2959 }
2960 }
2961
2962 if (scan_inst->dst.file == GRF &&
2963 scan_inst->dst.reg == inst->dst.reg &&
2964 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2965 scan_inst->is_tex())) {
2966 break;
2967 }
2968 }
2969 }
2970
2971 if (progress)
2972 this->live_intervals_valid = false;
2973
2974 return progress;
2975 }

2976 /**
2977 * Must be called after calculate_live_intervals() to remove unused
2978 * writes to registers -- register allocation will fail otherwise
2979 * because something def'd but not used won't be considered to
2980 * interfere with other regs.
2981 */
2982 bool
2983 fs_visitor::dead_code_eliminate()
2984 {
2985 bool progress = false;
2986 int pc = 0;
2987
2988 calculate_live_intervals();
2989
2990 foreach_iter(exec_list_iterator, iter, this->instructions) {
2991 fs_inst *inst = (fs_inst *)iter.get();
2992
2993 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
2994 inst->remove();
2995 progress = true;
2996 }
2997
2998 pc++;
2999 }
3000
3001 if (progress)
3002 live_intervals_valid = false;
3003
3004 return progress;
3005 }
3006
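/**
 * Tries to eliminate GRF-to-GRF moves by rewriting the later readers
 * of the destination to read the source directly, when neither
 * register is rewritten in between.
 */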
3007 bool
3008 fs_visitor::register_coalesce()
3009 {
3010 bool progress = false;
3011 int if_depth = 0;
3012 int loop_depth = 0;
3013
3014 foreach_iter(exec_list_iterator, iter, this->instructions) {
3015 fs_inst *inst = (fs_inst *)iter.get();
3016
3017 /* Make sure that we dominate the instructions we're going to
3018 * scan for interference with our coalescing; otherwise the
3019 * forward scan below won't see everything that could interfere.
3020 * We don't dominate the following instructions if we're in a
3021 * loop or an if block.
3022 */
3023 switch (inst->opcode) {
3024 case BRW_OPCODE_DO:
3025 loop_depth++;
3026 break;
3027 case BRW_OPCODE_WHILE:
3028 loop_depth--;
3029 break;
3030 case BRW_OPCODE_IF:
3031 if_depth++;
3032 break;
3033 case BRW_OPCODE_ENDIF:
3034 if_depth--;
3035 break;
3036 }
3037 if (loop_depth || if_depth)
3038 continue;
3039
3040 if (inst->opcode != BRW_OPCODE_MOV ||
3041 inst->predicated ||
3042 inst->saturate ||
3043 inst->dst.file != GRF || inst->src[0].file != GRF ||
3044 inst->dst.type != inst->src[0].type)
3045 continue;
3046
3047 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
3048
3049 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
3050 * them: check for no writes to either one until the exit of the
3051 * program.
3052 */
3053 bool interfered = false;
3054 exec_list_iterator scan_iter = iter;
3055 scan_iter.next();
3056 for (; scan_iter.has_next(); scan_iter.next()) {
3057 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3058
3059 if (scan_inst->dst.file == GRF) {
3060 if (scan_inst->dst.reg == inst->dst.reg &&
3061 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3062 scan_inst->is_tex())) {
3063 interfered = true;
3064 break;
3065 }
3066 if (scan_inst->dst.reg == inst->src[0].reg &&
3067 (scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
3068 scan_inst->is_tex())) {
3069 interfered = true;
3070 break;
3071 }
3072 }
3073
3074 /* The gen6 MATH instruction can't handle source modifiers, so avoid
3075 * coalescing those for now. We should do something more specific.
3076 */
3077 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) {
3078 interfered = true;
3079 break;
3080 }
3081 }
3082 if (interfered) {
3083 continue;
3084 }
3085
3086 /* Rewrite the later usage to point at the source of the move to
3087 * be removed.
3088 */
3089 for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
3090 scan_iter.next()) {
3091 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3092
3093 for (int i = 0; i < 3; i++) {
3094 if (scan_inst->src[i].file == GRF &&
3095 scan_inst->src[i].reg == inst->dst.reg &&
3096 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
3097 scan_inst->src[i].reg = inst->src[0].reg;
3098 scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
3099 scan_inst->src[i].abs |= inst->src[0].abs;
3100 scan_inst->src[i].negate ^= inst->src[0].negate;
3101 scan_inst->src[i].smear = inst->src[0].smear;
3102 }
3103 }
3104 }
3105
3106 inst->remove();
3107 progress = true;
3108 }
3109
3110 if (progress)
3111 live_intervals_valid = false;
3112
3113 return progress;
3114 }
3115
3116
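/**
 * Tries to remove GRF-to-MRF moves by making the instruction that
 * computed the GRF value write into the MRF directly, when that can
 * be proven safe.
 */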
3117 bool
3118 fs_visitor::compute_to_mrf()
3119 {
3120 bool progress = false;
3121 int next_ip = 0;
3122
3123 calculate_live_intervals();
3124
3125 foreach_iter(exec_list_iterator, iter, this->instructions) {
3126 fs_inst *inst = (fs_inst *)iter.get();
3127
3128 int ip = next_ip;
3129 next_ip++;
3130
3131 if (inst->opcode != BRW_OPCODE_MOV ||
3132 inst->predicated ||
3133 inst->dst.file != MRF || inst->src[0].file != GRF ||
3134 inst->dst.type != inst->src[0].type ||
3135 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
3136 continue;
3137
3138 /* Can't compute-to-MRF this GRF if someone else was going to
3139 * read it later.
3140 */
3141 if (this->virtual_grf_use[inst->src[0].reg] > ip)
3142 continue;
3143
3144 /* Found a move of a GRF to a MRF. Let's see if we can rewrite
3145 * the instruction that generated this GRF value to write into
3146 * the MRF directly.
3147 */
3147 fs_inst *scan_inst;
3148 for (scan_inst = (fs_inst *)inst->prev;
3149 scan_inst->prev != NULL;
3150 scan_inst = (fs_inst *)scan_inst->prev) {
3151 if (scan_inst->dst.file == GRF &&
3152 scan_inst->dst.reg == inst->src[0].reg) {
3153 /* Found the last write to the reg we want to turn
3154 * into a compute-to-MRF.
3155 */
3156
3157 if (scan_inst->is_tex()) {
3158 /* Texturing writes several contiguous regs, so we can't
3159 * compute-to-mrf that.
3160 */
3161 break;
3162 }
3163
3164 /* If it's predicated, it (probably) didn't populate all
3165 * the channels.
3166 */
3167 if (scan_inst->predicated)
3168 break;
3169
3170 /* SEND instructions can't have MRF as a destination. */
3171 if (scan_inst->mlen)
3172 break;
3173
3174 if (intel->gen >= 6) {
3175 /* gen6 math instructions must have the destination be
3176 * GRF, so no compute-to-MRF for them.
3177 */
3178 if (scan_inst->is_math()) {
3179 break;
3180 }
3181 }
3182
3183 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
3184 /* Found the creator of our MRF's source value. */
3185 scan_inst->dst.file = MRF;
3186 scan_inst->dst.hw_reg = inst->dst.hw_reg;
3187 scan_inst->saturate |= inst->saturate;
3188 inst->remove();
3189 progress = true;
3190 }
3191 break;
3192 }
3193
3194 /* We don't handle flow control here. Most computation of
3195 * values that end up in MRFs are shortly before the MRF
3196 * write anyway.
3197 */
3198 if (scan_inst->opcode == BRW_OPCODE_DO ||
3199 scan_inst->opcode == BRW_OPCODE_WHILE ||
3200 scan_inst->opcode == BRW_OPCODE_ELSE ||
3201 scan_inst->opcode == BRW_OPCODE_ENDIF) {
3202 break;
3203 }
3204
3205 /* You can't read from an MRF, so if someone else reads our
3206 * MRF's source GRF that we wanted to rewrite, that stops us.
3207 */
3208 bool interfered = false;
3209 for (int i = 0; i < 3; i++) {
3210 if (scan_inst->src[i].file == GRF &&
3211 scan_inst->src[i].reg == inst->src[0].reg &&
3212 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
3213 interfered = true;
3214 }
3215 }
3216 if (interfered)
3217 break;
3218
3219 if (scan_inst->dst.file == MRF &&
3220 scan_inst->dst.hw_reg == inst->dst.hw_reg) {
3221 /* Somebody else wrote our MRF here, so we can't
3222 * compute-to-MRF before that.
3223 */
3224 break;
3225 }
3226
3227 if (scan_inst->mlen > 0) {
3228 /* Found a SEND instruction, which means that there are
3229 * live values in MRFs from base_mrf to base_mrf +
3230 * scan_inst->mlen - 1. Don't go pushing our MRF write up
3231 * above it.
3232 */
3233 if (inst->dst.hw_reg >= scan_inst->base_mrf &&
3234 inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) {
3235 break;
3236 }
3237 }
3238 }
3239 }
3240
3241 return progress;
3242 }
3243
3244 /**
3245 * Walks through basic blocks, looking for repeated MRF writes and
3246 * removing the later ones.
3247 */
3248 bool
3249 fs_visitor::remove_duplicate_mrf_writes()
3250 {
3251 fs_inst *last_mrf_move[16];
3252 bool progress = false;
3253
3254 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3255
3256 foreach_iter(exec_list_iterator, iter, this->instructions) {
3257 fs_inst *inst = (fs_inst *)iter.get();
3258
3259 switch (inst->opcode) {
3260 case BRW_OPCODE_DO:
3261 case BRW_OPCODE_WHILE:
3262 case BRW_OPCODE_IF:
3263 case BRW_OPCODE_ELSE:
3264 case BRW_OPCODE_ENDIF:
3265 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3266 continue;
3267 default:
3268 break;
3269 }
3270
3271 if (inst->opcode == BRW_OPCODE_MOV &&
3272 inst->dst.file == MRF) {
3273 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3274 if (prev_inst && inst->equals(prev_inst)) {
3275 inst->remove();
3276 progress = true;
3277 continue;
3278 }
3279 }
3280
3281 /* Clear out the last-write records for MRFs that were overwritten. */
3282 if (inst->dst.file == MRF) {
3283 last_mrf_move[inst->dst.hw_reg] = NULL;
3284 }
3285
3286 if (inst->mlen > 0) {
3287 /* Found a SEND instruction, which will include two or fewer
3288 * implied MRF writes. We could do better here.
3289 */
3290 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3291 last_mrf_move[inst->base_mrf + i] = NULL;
3292 }
3293 }
3294
3295 /* Clear out any MRF move records whose sources got overwritten. */
3296 if (inst->dst.file == GRF) {
3297 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
3298 if (last_mrf_move[i] &&
3299 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3300 last_mrf_move[i] = NULL;
3301 }
3302 }
3303 }
3304
3305 if (inst->opcode == BRW_OPCODE_MOV &&
3306 inst->dst.file == MRF &&
3307 inst->src[0].file == GRF &&
3308 !inst->predicated) {
3309 last_mrf_move[inst->dst.hw_reg] = inst;
3310 }
3311 }
3312
3313 return progress;
3314 }
3315
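/**
 * Returns whether the live intervals of two virtual GRFs overlap;
 * this is the interference test the register allocator builds on.
 */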
3316 bool
3317 fs_visitor::virtual_grf_interferes(int a, int b)
3318 {
3319 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3320 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
3321
3322 /* We can't handle dead register writes here, without iterating
3323 * over the whole instruction stream to find every single dead
3324 * write to that register to compare to the live interval of the
3325 * other register. Just assert that dead_code_eliminate() has been
3326 * called.
3327 */
3328 assert((this->virtual_grf_use[a] != -1 ||
3329 this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
3330 (this->virtual_grf_use[b] != -1 ||
3331 this->virtual_grf_def[b] == MAX_INSTRUCTION));
3332
3333 return start < end;
3334 }
3335
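/* Lowers our virtual fs_reg description to a hardware brw_reg. */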
3336 static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
3337 {
3338 struct brw_reg brw_reg;
3339
3340 switch (reg->file) {
3341 case GRF:
3342 case ARF:
3343 case MRF:
3344 if (reg->smear == -1) {
3345 brw_reg = brw_vec8_reg(reg->file,
3346 reg->hw_reg, 0);
3347 } else {
3348 brw_reg = brw_vec1_reg(reg->file,
3349 reg->hw_reg, reg->smear);
3350 }
3351 brw_reg = retype(brw_reg, reg->type);
3352 break;
3353 case IMM:
3354 switch (reg->type) {
3355 case BRW_REGISTER_TYPE_F:
3356 brw_reg = brw_imm_f(reg->imm.f);
3357 break;
3358 case BRW_REGISTER_TYPE_D:
3359 brw_reg = brw_imm_d(reg->imm.i);
3360 break;
3361 case BRW_REGISTER_TYPE_UD:
3362 brw_reg = brw_imm_ud(reg->imm.u);
3363 break;
3364 default:
3365 assert(!"not reached");
3366 brw_reg = brw_null_reg();
3367 break;
3368 }
3369 break;
3370 case FIXED_HW_REG:
3371 brw_reg = reg->fixed_hw_reg;
3372 break;
3373 case BAD_FILE:
3374 /* Probably unused. */
3375 brw_reg = brw_null_reg();
3376 break;
3377 case UNIFORM:
3378 assert(!"not reached");
3379 brw_reg = brw_null_reg();
3380 break;
3381 default:
3382 assert(!"not reached");
3383 brw_reg = brw_null_reg();
3384 break;
3385 }
3386 if (reg->abs)
3387 brw_reg = brw_abs(brw_reg);
3388 if (reg->negate)
3389 brw_reg = negate(brw_reg);
3390
3391 return brw_reg;
3392 }
3393
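/**
 * The final codegen pass: walks the fs_inst list, lowers each
 * instruction to native code through the brw_eu assembler, and tracks
 * if/loop nesting so control-flow jumps can be patched up.
 */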
3394 void
3395 fs_visitor::generate_code()
3396 {
3397 int last_native_inst = 0;
3398 const char *last_annotation_string = NULL;
3399 ir_instruction *last_annotation_ir = NULL;
3400
3401 int if_stack_array_size = 16;
3402 int loop_stack_array_size = 16;
3403 int if_stack_depth = 0, loop_stack_depth = 0;
3404 brw_instruction **if_stack =
3405 rzalloc_array(this->mem_ctx, brw_instruction *, if_stack_array_size);
3406 brw_instruction **loop_stack =
3407 rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
3408 int *if_depth_in_loop =
3409 rzalloc_array(this->mem_ctx, int, loop_stack_array_size);
3410
3411
3412 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3413 printf("Native code for fragment shader %d:\n",
3414 ctx->Shader.CurrentFragmentProgram->Name);
3415 }
3416
3417 foreach_iter(exec_list_iterator, iter, this->instructions) {
3418 fs_inst *inst = (fs_inst *)iter.get();
3419 struct brw_reg src[3], dst;
3420
3421 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3422 if (last_annotation_ir != inst->ir) {
3423 last_annotation_ir = inst->ir;
3424 if (last_annotation_ir) {
3425 printf(" ");
3426 last_annotation_ir->print();
3427 printf("\n");
3428 }
3429 }
3430 if (last_annotation_string != inst->annotation) {
3431 last_annotation_string = inst->annotation;
3432 if (last_annotation_string)
3433 printf(" %s\n", last_annotation_string);
3434 }
3435 }
3436
3437 for (unsigned int i = 0; i < 3; i++) {
3438 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
3439 }
3440 dst = brw_reg_from_fs_reg(&inst->dst);
3441
3442 brw_set_conditionalmod(p, inst->conditional_mod);
3443 brw_set_predicate_control(p, inst->predicated);
3444 brw_set_saturate(p, inst->saturate);
3445
3446 switch (inst->opcode) {
3447 case BRW_OPCODE_MOV:
3448 brw_MOV(p, dst, src[0]);
3449 break;
3450 case BRW_OPCODE_ADD:
3451 brw_ADD(p, dst, src[0], src[1]);
3452 break;
3453 case BRW_OPCODE_MUL:
3454 brw_MUL(p, dst, src[0], src[1]);
3455 break;
3456
3457 case BRW_OPCODE_FRC:
3458 brw_FRC(p, dst, src[0]);
3459 break;
3460 case BRW_OPCODE_RNDD:
3461 brw_RNDD(p, dst, src[0]);
3462 break;
3463 case BRW_OPCODE_RNDE:
3464 brw_RNDE(p, dst, src[0]);
3465 break;
3466 case BRW_OPCODE_RNDZ:
3467 brw_RNDZ(p, dst, src[0]);
3468 break;
3469
3470 case BRW_OPCODE_AND:
3471 brw_AND(p, dst, src[0], src[1]);
3472 break;
3473 case BRW_OPCODE_OR:
3474 brw_OR(p, dst, src[0], src[1]);
3475 break;
3476 case BRW_OPCODE_XOR:
3477 brw_XOR(p, dst, src[0], src[1]);
3478 break;
3479 case BRW_OPCODE_NOT:
3480 brw_NOT(p, dst, src[0]);
3481 break;
3482 case BRW_OPCODE_ASR:
3483 brw_ASR(p, dst, src[0], src[1]);
3484 break;
3485 case BRW_OPCODE_SHR:
3486 brw_SHR(p, dst, src[0], src[1]);
3487 break;
3488 case BRW_OPCODE_SHL:
3489 brw_SHL(p, dst, src[0], src[1]);
3490 break;
3491
3492 case BRW_OPCODE_CMP:
3493 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
3494 break;
3495 case BRW_OPCODE_SEL:
3496 brw_SEL(p, dst, src[0], src[1]);
3497 break;
3498
3499 case BRW_OPCODE_IF:
3500 if (inst->src[0].file != BAD_FILE) {
3501 assert(intel->gen >= 6);
3502 if_stack[if_stack_depth] = gen6_IF(p, inst->conditional_mod, src[0], src[1]);
3503 } else {
3504 if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
3505 }
3506 if_depth_in_loop[loop_stack_depth]++;
3507 if_stack_depth++;
3508 if (if_stack_array_size <= if_stack_depth) {
3509 if_stack_array_size *= 2;
3510 if_stack = reralloc(this->mem_ctx, if_stack, brw_instruction *,
3511 if_stack_array_size);
3512 }
3513 break;
3514
3515 case BRW_OPCODE_ELSE:
3516 if_stack[if_stack_depth - 1] =
3517 brw_ELSE(p, if_stack[if_stack_depth - 1]);
3518 break;
3519 case BRW_OPCODE_ENDIF:
3520 if_stack_depth--;
3521 brw_ENDIF(p, if_stack[if_stack_depth]);
3522 if_depth_in_loop[loop_stack_depth]--;
3523 break;
3524
3525 case BRW_OPCODE_DO:
3526 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
3527 if (loop_stack_array_size <= loop_stack_depth) {
3528 loop_stack_array_size *= 2;
3529 loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
3530 loop_stack_array_size);
3531 if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
3532 loop_stack_array_size);
3533 }
3534 if_depth_in_loop[loop_stack_depth] = 0;
3535 break;
3536
3537 case BRW_OPCODE_BREAK:
3538 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
3539 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3540 break;
3541 case BRW_OPCODE_CONTINUE:
3542 /* FINISHME: We still need to write the loop instruction support. */
3543 if (intel->gen >= 6)
3544 gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
3545 else
3546 brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
3547 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3548 break;
3549
3550 case BRW_OPCODE_WHILE: {
3551 struct brw_instruction *inst0, *inst1;
3552 GLuint br = 1;
3553
3554 if (intel->gen >= 5)
3555 br = 2;
3556
3557 assert(loop_stack_depth > 0);
3558 loop_stack_depth--;
3559 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
3560 if (intel->gen < 6) {
3561 /* Patch all the BREAK/CONT instructions from the last BGNLOOP. */
3562 while (inst0 > loop_stack[loop_stack_depth]) {
3563 inst0--;
3564 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
3565 inst0->bits3.if_else.jump_count == 0) {
3566 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
3567 }
3568 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
3569 inst0->bits3.if_else.jump_count == 0) {
3570 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3571 }
3572 }
3573 }
3574 }
3575 break;
3576
3577 case FS_OPCODE_RCP:
3578 case FS_OPCODE_RSQ:
3579 case FS_OPCODE_SQRT:
3580 case FS_OPCODE_EXP2:
3581 case FS_OPCODE_LOG2:
3582 case FS_OPCODE_POW:
3583 case FS_OPCODE_SIN:
3584 case FS_OPCODE_COS:
3585 generate_math(inst, dst, src);
3586 break;
3587 case FS_OPCODE_CINTERP:
3588 brw_MOV(p, dst, src[0]);
3589 break;
3590 case FS_OPCODE_LINTERP:
3591 generate_linterp(inst, dst, src);
3592 break;
3593 case FS_OPCODE_TEX:
3594 case FS_OPCODE_TXB:
3595 case FS_OPCODE_TXD:
3596 case FS_OPCODE_TXL:
3597 generate_tex(inst, dst, src[0]);
3598 break;
3599 case FS_OPCODE_DISCARD_NOT:
3600 generate_discard_not(inst, dst);
3601 break;
3602 case FS_OPCODE_DISCARD_AND:
3603 generate_discard_and(inst, src[0]);
3604 break;
3605 case FS_OPCODE_DDX:
3606 generate_ddx(inst, dst, src[0]);
3607 break;
3608 case FS_OPCODE_DDY:
3609 generate_ddy(inst, dst, src[0]);
3610 break;
3611
3612 case FS_OPCODE_SPILL:
3613 generate_spill(inst, src[0]);
3614 break;
3615
3616 case FS_OPCODE_UNSPILL:
3617 generate_unspill(inst, dst);
3618 break;
3619
3620 case FS_OPCODE_PULL_CONSTANT_LOAD:
3621 generate_pull_constant_load(inst, dst);
3622 break;
3623
3624 case FS_OPCODE_FB_WRITE:
3625 generate_fb_write(inst);
3626 break;
3627 default:
3628 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
3629 _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
3630 brw_opcodes[inst->opcode].name);
3631 } else {
3632 _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
3633 }
3634 this->fail = true;
3635 }
3636
3637 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3638 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
3639 if (0) {
3640 printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3641 ((uint32_t *)&p->store[i])[3],
3642 ((uint32_t *)&p->store[i])[2],
3643 ((uint32_t *)&p->store[i])[1],
3644 ((uint32_t *)&p->store[i])[0]);
3645 }
3646 brw_disasm(stdout, &p->store[i], intel->gen);
3647 }
3648 }
3649
3650 last_native_inst = p->nr_insn;
3651 }
3652
3653 ralloc_free(if_stack);
3654 ralloc_free(loop_stack);
3655 ralloc_free(if_depth_in_loop);
3656
3657 brw_set_uip_jip(p);
3658
3659 /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
3660 * emit issues, it doesn't get the jump distances into the output,
3661 * which is often something we want to debug. So this is here in
3662 * case you're doing that.
3663 */
3664 if (0) {
3665 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3666 for (unsigned int i = 0; i < p->nr_insn; i++) {
3667 printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3668 ((uint32_t *)&p->store[i])[3],
3669 ((uint32_t *)&p->store[i])[2],
3670 ((uint32_t *)&p->store[i])[1],
3671 ((uint32_t *)&p->store[i])[0]);
3672 brw_disasm(stdout, &p->store[i], intel->gen);
3673 }
3674 }
3675 }
3676 }
3677
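/**
 * Top-level entry point of the GLSL FS backend: builds the fs_visitor
 * IR for the shader, runs the optimization loop, assigns registers,
 * and generates native code.
 */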
3678 GLboolean
3679 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
3680 {
3681 struct intel_context *intel = &brw->intel;
3682 struct gl_context *ctx = &intel->ctx;
3683 struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
3684
3685 if (!prog)
3686 return GL_FALSE;
3687
3688 struct brw_shader *shader =
3689 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3690 if (!shader)
3691 return GL_FALSE;
3692
3693 /* We always use 8-wide mode, at least for now. For one, flow
3694 * control only works in 8-wide. Also, when we're fragment-shader
3695 * bound, we're almost always under register pressure as well, so
3696 * 8-wide keeps us off the performance cliff of spilling
3697 * regs.
3698 */
3699 c->dispatch_width = 8;
3700
3701 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3702 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3703 _mesa_print_ir(shader->ir, NULL);
3704 printf("\n");
3705 }
3706
3707 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3708 */
3709 fs_visitor v(c, shader);
3710
3711 if (0) {
3712 v.emit_dummy_fs();
3713 } else {
3714 v.calculate_urb_setup();
3715 if (intel->gen < 6)
3716 v.emit_interpolation_setup_gen4();
3717 else
3718 v.emit_interpolation_setup_gen6();
3719
3720 /* Generate FS IR for main(). (The visitor only descends into
3721 * functions called "main".)
3722 */
3723 foreach_iter(exec_list_iterator, iter, *shader->ir) {
3724 ir_instruction *ir = (ir_instruction *)iter.get();
3725 v.base_ir = ir;
3726 ir->accept(&v);
3727 }
3728
3729 v.emit_fb_writes();
3730
3731 v.split_virtual_grfs();
3732
3733 v.setup_paramvalues_refs();
3734 v.setup_pull_constants();
3735
3736 bool progress;
3737 do {
3738 progress = false;
3739
3740 progress = v.remove_duplicate_mrf_writes() || progress;
3741
3742 progress = v.propagate_constants() || progress;
3743 progress = v.register_coalesce() || progress;
3744 progress = v.compute_to_mrf() || progress;
3745 progress = v.dead_code_eliminate() || progress;
3746 } while (progress);
3747
3748 v.schedule_instructions();
3749
3750 v.assign_curb_setup();
3751 v.assign_urb_setup();
3752
3753 if (0) {
3754 /* Debug of register spilling: Go spill everything. */
3755 int virtual_grf_count = v.virtual_grf_next;
3756 for (int i = 1; i < virtual_grf_count; i++) {
3757 v.spill_reg(i);
3758 }
3759 }
3760
3761 if (0)
3762 v.assign_regs_trivial();
3763 else {
3764 while (!v.assign_regs()) {
3765 if (v.fail)
3766 break;
3767 }
3768 }
3769 }
3770
3771 if (!v.fail)
3772 v.generate_code();
3773
3774 assert(!v.fail); /* FINISHME: Cleanly fail, tested at link time, etc. */
3775
3776 if (v.fail)
3777 return GL_FALSE;
3778
3779 c->prog_data.total_grf = v.grf_used;
3780
3781 return GL_TRUE;
3782 }