i965/fs: Rename the existing pull constant load opcode.
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_visitor.cpp
25 *
26 * This file supports generating the FS LIR from the GLSL IR. The LIR
27 * makes it easier to do backend-specific optimizations than doing so
28 * in the GLSL IR or in the native code.
29 */
30 extern "C" {
31
32 #include <sys/types.h>
33
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/uniforms.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "program/prog_optimize.h"
40 #include "program/register_allocate.h"
41 #include "program/sampler.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45 #include "brw_wm.h"
46 }
47 #include "brw_fs.h"
48 #include "glsl/glsl_types.h"
49 #include "glsl/ir_optimization.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_visitor::visit(ir_variable *ir)
54 {
55 fs_reg *reg = NULL;
56
57 if (variable_storage(ir))
58 return;
59
60 if (ir->mode == ir_var_in) {
61 if (!strcmp(ir->name, "gl_FragCoord")) {
62 reg = emit_fragcoord_interpolation(ir);
63 } else if (!strcmp(ir->name, "gl_FrontFacing")) {
64 reg = emit_frontfacing_interpolation(ir);
65 } else {
66 reg = emit_general_interpolation(ir);
67 }
68 assert(reg);
69 hash_table_insert(this->variable_ht, reg, ir);
70 return;
71 } else if (ir->mode == ir_var_out) {
72 reg = new(this->mem_ctx) fs_reg(this, ir->type);
73
74 if (ir->index > 0) {
75 assert(ir->location == FRAG_RESULT_DATA0);
76 assert(ir->index == 1);
77 this->dual_src_output = *reg;
78 } else if (ir->location == FRAG_RESULT_COLOR) {
79 /* Writing gl_FragColor outputs to all color regions. */
80 for (unsigned int i = 0; i < MAX2(c->key.nr_color_regions, 1); i++) {
81 this->outputs[i] = *reg;
82 this->output_components[i] = 4;
83 }
84 } else if (ir->location == FRAG_RESULT_DEPTH) {
85 this->frag_depth = *reg;
86 } else {
87 /* gl_FragData or a user-defined FS output */
88 assert(ir->location >= FRAG_RESULT_DATA0 &&
89 ir->location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
90
91 int vector_elements =
92 ir->type->is_array() ? ir->type->fields.array->vector_elements
93 : ir->type->vector_elements;
94
95 /* General color output. */
96 for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
97 int output = ir->location - FRAG_RESULT_DATA0 + i;
98 this->outputs[output] = *reg;
99 this->outputs[output].reg_offset += vector_elements * i;
100 this->output_components[output] = vector_elements;
101 }
102 }
103 } else if (ir->mode == ir_var_uniform) {
104 int param_index = c->prog_data.nr_params;
105
106 /* Thanks to the lower_ubo_reference pass, we will see only
107 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
108 * variables, so no need for them to be in variable_ht.
109 */
110 if (ir->uniform_block != -1)
111 return;
112
113 if (dispatch_width == 16) {
114 if (!variable_storage(ir)) {
115 fail("Failed to find uniform '%s' in 16-wide\n", ir->name);
116 }
117 return;
118 }
119
120 if (!strncmp(ir->name, "gl_", 3)) {
121 setup_builtin_uniform_values(ir);
122 } else {
123 setup_uniform_values(ir->location, ir->type);
124 }
125
126 reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
127 reg->type = brw_type_for_base_type(ir->type);
128 }
129
130 if (!reg)
131 reg = new(this->mem_ctx) fs_reg(this, ir->type);
132
133 hash_table_insert(this->variable_ht, reg, ir);
134 }
135
136 void
137 fs_visitor::visit(ir_dereference_variable *ir)
138 {
139 fs_reg *reg = variable_storage(ir->var);
140 this->result = *reg;
141 }
142
143 void
144 fs_visitor::visit(ir_dereference_record *ir)
145 {
146 const glsl_type *struct_type = ir->record->type;
147
148 ir->record->accept(this);
149
150 unsigned int offset = 0;
151 for (unsigned int i = 0; i < struct_type->length; i++) {
152 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
153 break;
154 offset += type_size(struct_type->fields.structure[i].type);
155 }
156 this->result.reg_offset += offset;
157 this->result.type = brw_type_for_base_type(ir->type);
158 }
159
160 void
161 fs_visitor::visit(ir_dereference_array *ir)
162 {
163 ir_constant *index;
164 int element_size;
165
166 ir->array->accept(this);
167 index = ir->array_index->as_constant();
168
169 element_size = type_size(ir->type);
170 this->result.type = brw_type_for_base_type(ir->type);
171
172 if (index) {
173 assert(this->result.file == UNIFORM || this->result.file == GRF);
174 this->result.reg_offset += index->value.i[0] * element_size;
175 } else {
176 assert(!"FINISHME: non-constant array element");
177 }
178 }
179
180 void
181 fs_visitor::emit_minmax(uint32_t conditionalmod, fs_reg dst,
182 fs_reg src0, fs_reg src1)
183 {
184 fs_inst *inst;
185
186 if (intel->gen >= 6) {
187 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
188 inst->conditional_mod = conditionalmod;
189 } else {
190 emit(CMP(reg_null_d, src0, src1, conditionalmod));
191
192 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
193 inst->predicate = BRW_PREDICATE_NORMAL;
194 }
195 }
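/* For example, min(a, b) on gen6+ becomes a single
 *
 *    sel.l dst, a, b
 *
 * while the pre-gen6 path is roughly (illustrative assembly, not the
 * exact encoded form):
 *
 *    cmp.l null, a, b
 *    (+f0) sel dst, a, b
 */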
196
197 /* Instruction selection: Produce a MOV.sat instead of
198 * MIN(MAX(val, 0), 1) when possible.
199 */
200 bool
201 fs_visitor::try_emit_saturate(ir_expression *ir)
202 {
203 ir_rvalue *sat_val = ir->as_rvalue_to_saturate();
204
205 if (!sat_val)
206 return false;
207
208 fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
209
210 sat_val->accept(this);
211 fs_reg src = this->result;
212
213 fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
214
215 /* If the last instruction from our accept() didn't generate our
216 * src, generate a saturated MOV.
217 */
218 fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
219 if (!modify || modify->regs_written() != 1) {
220 this->result = fs_reg(this, ir->type);
221 fs_inst *inst = emit(MOV(this->result, src));
222 inst->saturate = true;
223 } else {
224 modify->saturate = true;
225 this->result = src;
226 }
227
228
229 return true;
230 }
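/* Illustrative example of the transformation above (not the exact
 * emitted encoding): for
 *
 *    x = clamp(a + b, 0.0, 1.0);
 *
 * as_rvalue_to_saturate() hands back the inner (a + b), and instead of
 * emitting a MIN/MAX pair we either set the .sat destination modifier
 * on the instruction that produced the value:
 *
 *    add.sat x, a, b
 *
 * or, when that instruction can't safely be reused, emit a single
 *
 *    mov.sat x, tmp
 */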
231
232 bool
233 fs_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
234 {
235 /* 3-src instructions were introduced in gen6. */
236 if (intel->gen < 6)
237 return false;
238
239 /* MAD can only handle floating-point data. */
240 if (ir->type != glsl_type::float_type)
241 return false;
242
243 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
244 ir_expression *mul = ir->operands[mul_arg]->as_expression();
245
246 if (!mul || mul->operation != ir_binop_mul)
247 return false;
248
249 if (nonmul->as_constant() ||
250 mul->operands[0]->as_constant() ||
251 mul->operands[1]->as_constant())
252 return false;
253
254 nonmul->accept(this);
255 fs_reg src0 = this->result;
256
257 mul->operands[0]->accept(this);
258 fs_reg src1 = this->result;
259
260 mul->operands[1]->accept(this);
261 fs_reg src2 = this->result;
262
263 this->result = fs_reg(this, ir->type);
264 emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
265
266 return true;
267 }
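/* Illustrative example: for a pattern like
 *
 *    dst = a * b + c
 *
 * with all three operands non-constant floats, the MUL+ADD pair
 *
 *    mul tmp, a, b
 *    add dst, tmp, c
 *
 * collapses into one 3-source instruction computing dst = c + a * b:
 *
 *    mad dst, c, a, b
 *
 * Constants are rejected above because 3-source instructions can't take
 * immediate operands, so a MAD would just force an extra MOV to a GRF.
 */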
268
269 void
270 fs_visitor::visit(ir_expression *ir)
271 {
272 unsigned int operand;
273 fs_reg op[2], temp;
274 fs_inst *inst;
275
276 assert(ir->get_num_operands() <= 2);
277
278 if (try_emit_saturate(ir))
279 return;
280 if (ir->operation == ir_binop_add) {
281 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
282 return;
283 }
284
285 for (operand = 0; operand < ir->get_num_operands(); operand++) {
286 ir->operands[operand]->accept(this);
287 if (this->result.file == BAD_FILE) {
288 ir_print_visitor v;
289 fail("Failed to get tree for expression operand:\n");
290 ir->operands[operand]->accept(&v);
291 }
292 op[operand] = this->result;
293
294 /* Matrix expression operands should have been broken down to vector
295 * operations already.
296 */
297 assert(!ir->operands[operand]->type->is_matrix());
298 /* And then those vector operands should have been broken down to scalar.
299 */
300 assert(!ir->operands[operand]->type->is_vector());
301 }
302
303 /* Storage for our result. If our result goes into an assignment, it will
304 * just get copy-propagated out, so no worries.
305 */
306 this->result = fs_reg(this, ir->type);
307
308 switch (ir->operation) {
309 case ir_unop_logic_not:
310 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes
311 * the one's complement of the whole register, not just bit 0.
312 */
313 emit(XOR(this->result, op[0], fs_reg(1)));
314 break;
315 case ir_unop_neg:
316 op[0].negate = !op[0].negate;
317 this->result = op[0];
318 break;
319 case ir_unop_abs:
320 op[0].abs = true;
321 op[0].negate = false;
322 this->result = op[0];
323 break;
324 case ir_unop_sign:
325 temp = fs_reg(this, ir->type);
326
327 emit(MOV(this->result, fs_reg(0.0f)));
328
329 emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_G));
330 inst = emit(MOV(this->result, fs_reg(1.0f)));
331 inst->predicate = BRW_PREDICATE_NORMAL;
332
333 emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_L));
334 inst = emit(MOV(this->result, fs_reg(-1.0f)));
335 inst->predicate = BRW_PREDICATE_NORMAL;
336
337 break;
338 case ir_unop_rcp:
339 emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
340 break;
341
342 case ir_unop_exp2:
343 emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
344 break;
345 case ir_unop_log2:
346 emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
347 break;
348 case ir_unop_exp:
349 case ir_unop_log:
350 assert(!"not reached: should be handled by ir_explog_to_explog2");
351 break;
352 case ir_unop_sin:
353 case ir_unop_sin_reduced:
354 emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
355 break;
356 case ir_unop_cos:
357 case ir_unop_cos_reduced:
358 emit_math(SHADER_OPCODE_COS, this->result, op[0]);
359 break;
360
361 case ir_unop_dFdx:
362 emit(FS_OPCODE_DDX, this->result, op[0]);
363 break;
364 case ir_unop_dFdy:
365 emit(FS_OPCODE_DDY, this->result, op[0]);
366 break;
367
368 case ir_binop_add:
369 emit(ADD(this->result, op[0], op[1]));
370 break;
371 case ir_binop_sub:
372 assert(!"not reached: should be handled by ir_sub_to_add_neg");
373 break;
374
375 case ir_binop_mul:
376 if (ir->type->is_integer()) {
377 /* For integer multiplication, the MUL uses the low 16 bits
378 * of one of the operands (src0 on gen6, src1 on gen7). The
379 * MACH then accumulates the contribution of the upper 16 bits
380 * of that operand.
381 *
382 * FINISHME: Emit just the MUL if we know an operand is small
383 * enough.
384 */
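/* The sequence emitted below computes the full 32-bit product in the
 * accumulator, roughly (a sketch; the accumulator semantics are more
 * involved than shown):
 *
 *    mul  acc,  op0, op1   // partial product using the low 16 bits
 *    mach null, op0, op1   // folds in the upper-16-bit contribution
 *    mov  dst,  acc        // copy out the low 32 bits of op0 * op1
 */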
385 if (intel->gen >= 7 && dispatch_width == 16)
386 fail("16-wide explicit accumulator operands unsupported\n");
387
388 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
389
390 emit(MUL(acc, op[0], op[1]));
391 emit(MACH(reg_null_d, op[0], op[1]));
392 emit(MOV(this->result, fs_reg(acc)));
393 } else {
394 emit(MUL(this->result, op[0], op[1]));
395 }
396 break;
397 case ir_binop_div:
398 if (intel->gen >= 7 && dispatch_width == 16)
399 fail("16-wide INTDIV unsupported\n");
400
401 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
402 assert(ir->type->is_integer());
403 emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
404 break;
405 case ir_binop_mod:
406 if (intel->gen >= 7 && dispatch_width == 16)
407 fail("16-wide INTDIV unsupported\n");
408
409 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
410 assert(ir->type->is_integer());
411 emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
412 break;
413
414 case ir_binop_less:
415 case ir_binop_greater:
416 case ir_binop_lequal:
417 case ir_binop_gequal:
418 case ir_binop_equal:
419 case ir_binop_all_equal:
420 case ir_binop_nequal:
421 case ir_binop_any_nequal:
422 resolve_bool_comparison(ir->operands[0], &op[0]);
423 resolve_bool_comparison(ir->operands[1], &op[1]);
424
425 emit(CMP(this->result, op[0], op[1],
426 brw_conditional_for_comparison(ir->operation)));
427 break;
428
429 case ir_binop_logic_xor:
430 emit(XOR(this->result, op[0], op[1]));
431 break;
432
433 case ir_binop_logic_or:
434 emit(OR(this->result, op[0], op[1]));
435 break;
436
437 case ir_binop_logic_and:
438 emit(AND(this->result, op[0], op[1]));
439 break;
440
441 case ir_binop_dot:
442 case ir_unop_any:
443 assert(!"not reached: should be handled by brw_fs_channel_expressions");
444 break;
445
446 case ir_unop_noise:
447 assert(!"not reached: should be handled by lower_noise");
448 break;
449
450 case ir_quadop_vector:
451 assert(!"not reached: should be handled by lower_quadop_vector");
452 break;
453
454 case ir_unop_sqrt:
455 emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
456 break;
457
458 case ir_unop_rsq:
459 emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
460 break;
461
462 case ir_unop_bitcast_i2f:
463 case ir_unop_bitcast_u2f:
464 op[0].type = BRW_REGISTER_TYPE_F;
465 this->result = op[0];
466 break;
467 case ir_unop_i2u:
468 case ir_unop_bitcast_f2u:
469 op[0].type = BRW_REGISTER_TYPE_UD;
470 this->result = op[0];
471 break;
472 case ir_unop_u2i:
473 case ir_unop_bitcast_f2i:
474 op[0].type = BRW_REGISTER_TYPE_D;
475 this->result = op[0];
476 break;
477 case ir_unop_i2f:
478 case ir_unop_u2f:
479 case ir_unop_f2i:
480 case ir_unop_f2u:
481 emit(MOV(this->result, op[0]));
482 break;
483
484 case ir_unop_b2i:
485 inst = emit(AND(this->result, op[0], fs_reg(1)));
486 break;
487 case ir_unop_b2f:
488 temp = fs_reg(this, glsl_type::int_type);
489 emit(AND(temp, op[0], fs_reg(1)));
490 emit(MOV(this->result, temp));
491 break;
492
493 case ir_unop_f2b:
   emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
   break;
494 case ir_unop_i2b:
   /* Compare against an integer zero so we don't mix float and int
    * source types in one instruction, matching emit_bool_to_cond_code().
    */
495 emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
496 break;
497
498 case ir_unop_trunc:
499 emit(RNDZ(this->result, op[0]));
500 break;
501 case ir_unop_ceil:
502 op[0].negate = !op[0].negate;
503 inst = emit(RNDD(this->result, op[0]));
504 this->result.negate = true;
505 break;
506 case ir_unop_floor:
507 inst = emit(RNDD(this->result, op[0]));
508 break;
509 case ir_unop_fract:
510 inst = emit(FRC(this->result, op[0]));
511 break;
512 case ir_unop_round_even:
513 emit(RNDE(this->result, op[0]));
514 break;
515
516 case ir_binop_min:
517 case ir_binop_max:
518 resolve_ud_negate(&op[0]);
519 resolve_ud_negate(&op[1]);
520 emit_minmax(ir->operation == ir_binop_min ?
521 BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
522 this->result, op[0], op[1]);
523 break;
524
525 case ir_binop_pow:
526 emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
527 break;
528
529 case ir_unop_bit_not:
530 inst = emit(NOT(this->result, op[0]));
531 break;
532 case ir_binop_bit_and:
533 inst = emit(AND(this->result, op[0], op[1]));
534 break;
535 case ir_binop_bit_xor:
536 inst = emit(XOR(this->result, op[0], op[1]));
537 break;
538 case ir_binop_bit_or:
539 inst = emit(OR(this->result, op[0], op[1]));
540 break;
541
542 case ir_binop_lshift:
543 inst = emit(SHL(this->result, op[0], op[1]));
544 break;
545
546 case ir_binop_rshift:
547 if (ir->type->base_type == GLSL_TYPE_INT)
548 inst = emit(ASR(this->result, op[0], op[1]));
549 else
550 inst = emit(SHR(this->result, op[0], op[1]));
551 break;
552
553 case ir_binop_ubo_load:
554 ir_constant *uniform_block = ir->operands[0]->as_constant();
555 ir_constant *offset = ir->operands[1]->as_constant();
556
557 fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
558 packed_consts.type = result.type;
559 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_WM_UBO(uniform_block->value.u[0]));
560 fs_inst *pull = emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
561 packed_consts,
562 surf_index,
563 fs_reg(offset->value.u[0])));
564 pull->base_mrf = 14;
565 pull->mlen = 1;
566
567 packed_consts.smear = offset->value.u[0] % 16 / 4;
568 for (int i = 0; i < ir->type->vector_elements; i++) {
569 /* UBO bools are any nonzero value. We consider bools to be
570 * values with the low bit set to 1. Convert them using CMP.
571 */
572 if (ir->type->base_type == GLSL_TYPE_BOOL) {
573 emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
574 } else {
575 emit(MOV(result, packed_consts));
576 }
577
578 packed_consts.smear++;
579 result.reg_offset++;
580
581 /* The std140 packing rules don't allow vectors to cross 16-byte
582 * boundaries, and a reg is 32 bytes.
583 */
584 assert(packed_consts.smear < 8);
585 }
586 result.reg_offset = 0;
587 break;
588 }
589 }
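/* Worked example for the ir_binop_ubo_load case above, assuming std140
 * layout: a vec2 at byte offset 20 within the uniform block pulls in
 * the aligned block of constants containing that offset, and smear
 * starts at 20 % 16 / 4 == 1, so the two components are read from
 * dwords 1 and 2 of packed_consts.  The smear < 8 assertion holds
 * because std140 never lets a vector straddle a 16-byte boundary.
 */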
590
591 void
592 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
593 const glsl_type *type, bool predicated)
594 {
595 switch (type->base_type) {
596 case GLSL_TYPE_FLOAT:
597 case GLSL_TYPE_UINT:
598 case GLSL_TYPE_INT:
599 case GLSL_TYPE_BOOL:
600 for (unsigned int i = 0; i < type->components(); i++) {
601 l.type = brw_type_for_base_type(type);
602 r.type = brw_type_for_base_type(type);
603
604 if (predicated || !l.equals(r)) {
605 fs_inst *inst = emit(MOV(l, r));
606 inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
607 }
608
609 l.reg_offset++;
610 r.reg_offset++;
611 }
612 break;
613 case GLSL_TYPE_ARRAY:
614 for (unsigned int i = 0; i < type->length; i++) {
615 emit_assignment_writes(l, r, type->fields.array, predicated);
616 }
617 break;
618
619 case GLSL_TYPE_STRUCT:
620 for (unsigned int i = 0; i < type->length; i++) {
621 emit_assignment_writes(l, r, type->fields.structure[i].type,
622 predicated);
623 }
624 break;
625
626 case GLSL_TYPE_SAMPLER:
627 break;
628
629 default:
630 assert(!"not reached");
631 break;
632 }
633 }
634
635 /* If the RHS processing resulted in an instruction generating a
636 * temporary value, and it would be easy to rewrite the instruction to
637 * generate its result right into the LHS instead, do so. This ends
638 * up reliably removing instructions where it can be tricky to do so
639 * later without real UD chain information.
640 */
641 bool
642 fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
643 fs_reg dst,
644 fs_reg src,
645 fs_inst *pre_rhs_inst,
646 fs_inst *last_rhs_inst)
647 {
648 /* Only attempt if we're doing a direct assignment. */
649 if (ir->condition ||
650 !(ir->lhs->type->is_scalar() ||
651 (ir->lhs->type->is_vector() &&
652 ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
653 return false;
654
655 /* Make sure the last instruction generated our source reg. */
656 fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
657 last_rhs_inst,
658 src);
659 if (!modify)
660 return false;
661
662 /* If last_rhs_inst wrote a different number of components than our LHS,
663 * we can't safely rewrite it.
664 */
665 if (ir->lhs->type->vector_elements != modify->regs_written())
666 return false;
667
668 /* Success! Rewrite the instruction. */
669 modify->dst = dst;
670
671 return true;
672 }
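/* Example of the rewrite: for a simple assignment
 *
 *    x = a + b;
 *
 * visiting the RHS produces "add tmp, a, b", and without this pass the
 * assignment would add "mov x, tmp".  Since the ADD is the instruction
 * that generated tmp and writes the same number of components as x, we
 * retarget it to "add x, a, b" and skip emitting the MOV entirely.
 */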
673
674 void
675 fs_visitor::visit(ir_assignment *ir)
676 {
677 fs_reg l, r;
678 fs_inst *inst;
679
680 /* FINISHME: arrays on the lhs */
681 ir->lhs->accept(this);
682 l = this->result;
683
684 fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
685
686 ir->rhs->accept(this);
687 r = this->result;
688
689 fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
690
691 assert(l.file != BAD_FILE);
692 assert(r.file != BAD_FILE);
693
694 if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
695 return;
696
697 if (ir->condition) {
698 emit_bool_to_cond_code(ir->condition);
699 }
700
701 if (ir->lhs->type->is_scalar() ||
702 ir->lhs->type->is_vector()) {
703 for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
704 if (ir->write_mask & (1 << i)) {
705 inst = emit(MOV(l, r));
706 if (ir->condition)
707 inst->predicate = BRW_PREDICATE_NORMAL;
708 r.reg_offset++;
709 }
710 l.reg_offset++;
711 }
712 } else {
713 emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
714 }
715 }
716
717 fs_inst *
718 fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
719 fs_reg shadow_c, fs_reg lod, fs_reg dPdy)
720 {
721 int mlen;
722 int base_mrf = 1;
723 bool simd16 = false;
724 fs_reg orig_dst;
725
726 /* g0 header. */
727 mlen = 1;
728
729 if (ir->shadow_comparitor) {
730 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
731 emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
732 coordinate.reg_offset++;
733 }
734 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
735 mlen += 3;
736
737 if (ir->op == ir_tex) {
738 /* There's no plain shadow compare message, so we use shadow
739 * compare with a bias of 0.0.
740 */
741 emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
742 mlen++;
743 } else if (ir->op == ir_txb || ir->op == ir_txl) {
744 emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
745 mlen++;
746 } else {
747 assert(!"Should not get here.");
748 }
749
750 emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
751 mlen++;
752 } else if (ir->op == ir_tex) {
753 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
754 emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
755 coordinate.reg_offset++;
756 }
757 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
758 mlen += 3;
759 } else if (ir->op == ir_txd) {
760 fs_reg &dPdx = lod;
761
762 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
763 emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
764 coordinate.reg_offset++;
765 }
766 /* the slots for u and v are always present, but r is optional */
767 mlen += MAX2(ir->coordinate->type->vector_elements, 2);
768
769 /* P = u, v, r
770 * dPdx = dudx, dvdx, drdx
771 * dPdy = dudy, dvdy, drdy
772 *
773 * 1-arg: Does not exist.
774 *
775 * 2-arg: dudx dvdx dudy dvdy
776 * dPdx.x dPdx.y dPdy.x dPdy.y
777 * m4 m5 m6 m7
778 *
779 * 3-arg: dudx dvdx drdx dudy dvdy drdy
780 * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
781 * m5 m6 m7 m8 m9 m10
782 */
783 for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
784 emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
785 dPdx.reg_offset++;
786 }
787 mlen += MAX2(ir->lod_info.grad.dPdx->type->vector_elements, 2);
788
789 for (int i = 0; i < ir->lod_info.grad.dPdy->type->vector_elements; i++) {
790 emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
791 dPdy.reg_offset++;
792 }
793 mlen += MAX2(ir->lod_info.grad.dPdy->type->vector_elements, 2);
794 } else if (ir->op == ir_txs) {
795 /* There's no SIMD8 resinfo message on Gen4. Use SIMD16 instead. */
796 simd16 = true;
797 emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
798 mlen += 2;
799 } else {
800 /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod
801 * instructions. We'll need to do SIMD16 here.
802 */
803 simd16 = true;
804 assert(ir->op == ir_txb || ir->op == ir_txl || ir->op == ir_txf);
805
806 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
807 emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
808 coordinate));
809 coordinate.reg_offset++;
810 }
811
812 /* Initialize the rest of u/v/r with 0.0. Empirically, this seems to
813 * be necessary for TXF (ld), but seems wise to do for all messages.
814 */
815 for (int i = ir->coordinate->type->vector_elements; i < 3; i++) {
816 emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
817 }
818
819 /* lod/bias appears after u/v/r. */
820 mlen += 6;
821
822 emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
823 mlen++;
824
825 /* The unused upper half. */
826 mlen++;
827 }
828
829 if (simd16) {
830 /* Now, since we're doing simd16, the return is 2 interleaved
831 * vec4s where the odd-indexed ones are junk. We'll need to move
832 * this weirdness around to the expected layout.
833 */
834 orig_dst = dst;
835 const glsl_type *vec_type =
836 glsl_type::get_instance(ir->type->base_type, 4, 1);
837 dst = fs_reg(this, glsl_type::get_array_instance(vec_type, 2));
838 dst.type = intel->is_g4x ? brw_type_for_base_type(ir->type)
839 : BRW_REGISTER_TYPE_F;
840 }
841
842 fs_inst *inst = NULL;
843 switch (ir->op) {
844 case ir_tex:
845 inst = emit(SHADER_OPCODE_TEX, dst);
846 break;
847 case ir_txb:
848 inst = emit(FS_OPCODE_TXB, dst);
849 break;
850 case ir_txl:
851 inst = emit(SHADER_OPCODE_TXL, dst);
852 break;
853 case ir_txd:
854 inst = emit(SHADER_OPCODE_TXD, dst);
855 break;
856 case ir_txs:
857 inst = emit(SHADER_OPCODE_TXS, dst);
858 break;
859 case ir_txf:
860 inst = emit(SHADER_OPCODE_TXF, dst);
861 break;
862 }
863 inst->base_mrf = base_mrf;
864 inst->mlen = mlen;
865 inst->header_present = true;
866
867 if (simd16) {
868 for (int i = 0; i < 4; i++) {
869 emit(MOV(orig_dst, dst));
870 orig_dst.reg_offset++;
871 dst.reg_offset += 2;
872 }
873 }
874
875 return inst;
876 }
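/* Layout of the simd16 workaround above: the SIMD16 response arrives as
 * register pairs per channel,
 *
 *    dst+0: x (low 8 pixels)    dst+1: x (unused upper half)
 *    dst+2: y (low 8 pixels)    dst+3: y (unused upper half)
 *    dst+4: z ...               dst+5: ...
 *    dst+6: w ...               dst+7: ...
 *
 * so the final loop copies every other register (dst+0, +2, +4, +6)
 * into four consecutive registers of the original destination.
 */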
877
878 /* gen5's sampler has slots for u, v, r, array index, then optional
879 * parameters like the shadow comparator or LOD bias. If optional
880 * parameters aren't present, those base slots are optional and don't
881 * need to be included in the message.
882 *
883 * Even when later parameters force the base slots to be present, we
884 * don't fill in the unused ones, which may look surprising in the disassembly.
885 */
886 fs_inst *
887 fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
888 fs_reg shadow_c, fs_reg lod, fs_reg lod2)
889 {
890 int mlen = 0;
891 int base_mrf = 2;
892 int reg_width = dispatch_width / 8;
893 bool header_present = false;
894 const int vector_elements =
895 ir->coordinate ? ir->coordinate->type->vector_elements : 0;
896
897 if (ir->offset != NULL && ir->op == ir_txf) {
898 /* It appears that the ld instruction used for txf does its
899 * address bounds check before adding in the offset. To work
900 * around this, just add the integer offset to the integer texel
901 * coordinate, and don't put the offset in the header.
902 */
903 ir_constant *offset = ir->offset->as_constant();
904 for (int i = 0; i < vector_elements; i++) {
905 emit(ADD(fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
906 coordinate,
907 offset->value.i[i]));
908 coordinate.reg_offset++;
909 }
910 } else {
911 if (ir->offset) {
912 /* The offsets set up by the ir_texture visitor are in the
913 * m1 header, so we can't go headerless.
914 */
915 header_present = true;
916 mlen++;
917 base_mrf--;
918 }
919
920 for (int i = 0; i < vector_elements; i++) {
921 emit(MOV(fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
922 coordinate));
923 coordinate.reg_offset++;
924 }
925 }
926 mlen += vector_elements * reg_width;
927
928 if (ir->shadow_comparitor) {
929 mlen = MAX2(mlen, header_present + 4 * reg_width);
930
931 emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
932 mlen += reg_width;
933 }
934
935 fs_inst *inst = NULL;
936 switch (ir->op) {
937 case ir_tex:
938 inst = emit(SHADER_OPCODE_TEX, dst);
939 break;
940 case ir_txb:
941 mlen = MAX2(mlen, header_present + 4 * reg_width);
942 emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
943 mlen += reg_width;
944
945 inst = emit(FS_OPCODE_TXB, dst);
946 break;
947 case ir_txl:
948 mlen = MAX2(mlen, header_present + 4 * reg_width);
949 emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
950 mlen += reg_width;
951
952 inst = emit(SHADER_OPCODE_TXL, dst);
953 break;
954 case ir_txd: {
955 mlen = MAX2(mlen, header_present + 4 * reg_width); /* skip over 'ai' */
956
957 /*
958 * P = u, v, r
959 * dPdx = dudx, dvdx, drdx
960 * dPdy = dudy, dvdy, drdy
961 *
962 * Load up these values:
963 * - dudx dudy dvdx dvdy drdx drdy
964 * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
965 */
966 for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
967 emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
968 lod.reg_offset++;
969 mlen += reg_width;
970
971 emit(MOV(fs_reg(MRF, base_mrf + mlen), lod2));
972 lod2.reg_offset++;
973 mlen += reg_width;
974 }
975
976 inst = emit(SHADER_OPCODE_TXD, dst);
977 break;
978 }
979 case ir_txs:
980 emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
981 mlen += reg_width;
982 inst = emit(SHADER_OPCODE_TXS, dst);
983 break;
984 case ir_txf:
985 mlen = header_present + 4 * reg_width;
986
987 emit(MOV(fs_reg(MRF, base_mrf + mlen - reg_width, BRW_REGISTER_TYPE_UD),
988 lod));
989 inst = emit(SHADER_OPCODE_TXF, dst);
990 break;
991 }
992 inst->base_mrf = base_mrf;
993 inst->mlen = mlen;
994 inst->header_present = header_present;
995
996 if (mlen > 11) {
997 fail("Message length >11 disallowed by hardware\n");
998 }
999
1000 return inst;
1001 }
1002
1003 fs_inst *
1004 fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
1005 fs_reg shadow_c, fs_reg lod, fs_reg lod2)
1006 {
1007 int mlen = 0;
1008 int base_mrf = 2;
1009 int reg_width = dispatch_width / 8;
1010 bool header_present = false;
1011 int offsets[3];
1012
1013 if (ir->offset && ir->op != ir_txf) {
1014 /* The offsets set up by the ir_texture visitor are in the
1015 * m1 header, so we can't go headerless.
1016 */
1017 header_present = true;
1018 mlen++;
1019 base_mrf--;
1020 }
1021
1022 if (ir->shadow_comparitor) {
1023 emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
1024 mlen += reg_width;
1025 }
1026
1027 /* Set up the LOD info */
1028 switch (ir->op) {
1029 case ir_tex:
1030 break;
1031 case ir_txb:
1032 emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
1033 mlen += reg_width;
1034 break;
1035 case ir_txl:
1036 emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
1037 mlen += reg_width;
1038 break;
1039 case ir_txd: {
1040 if (dispatch_width == 16)
1041 fail("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
1042
1043 /* Load dPdx and the coordinate together:
1044 * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
1045 */
1046 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1047 emit(MOV(fs_reg(MRF, base_mrf + mlen), coordinate));
1048 coordinate.reg_offset++;
1049 mlen += reg_width;
1050
1051 emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
1052 lod.reg_offset++;
1053 mlen += reg_width;
1054
1055 emit(MOV(fs_reg(MRF, base_mrf + mlen), lod2));
1056 lod2.reg_offset++;
1057 mlen += reg_width;
1058 }
1059 break;
1060 }
1061 case ir_txs:
1062 emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
1063 mlen += reg_width;
1064 break;
1065 case ir_txf:
1066 /* It appears that the ld instruction used for txf does its
1067 * address bounds check before adding in the offset. To work
1068 * around this, just add the integer offset to the integer texel
1069 * coordinate, and don't put the offset in the header.
1070 */
1071 if (ir->offset) {
1072 ir_constant *offset = ir->offset->as_constant();
1073 offsets[0] = offset->value.i[0];
1074 offsets[1] = offset->value.i[1];
1075 offsets[2] = offset->value.i[2];
1076 } else {
1077 memset(offsets, 0, sizeof(offsets));
1078 }
1079
1080 /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
1081 emit(ADD(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D),
1082 coordinate, offsets[0]));
1083 coordinate.reg_offset++;
1084 mlen += reg_width;
1085
1086 emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), lod));
1087 mlen += reg_width;
1088
1089 for (int i = 1; i < ir->coordinate->type->vector_elements; i++) {
1090 emit(ADD(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D),
1091 coordinate, offsets[i]));
1092 coordinate.reg_offset++;
1093 mlen += reg_width;
1094 }
1095 break;
1096 }
1097
1098 /* Set up the coordinate (except for cases where it was done above) */
1099 if (ir->op != ir_txd && ir->op != ir_txs && ir->op != ir_txf) {
1100 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1101 emit(MOV(fs_reg(MRF, base_mrf + mlen), coordinate));
1102 coordinate.reg_offset++;
1103 mlen += reg_width;
1104 }
1105 }
1106
1107 /* Generate the SEND */
1108 fs_inst *inst = NULL;
1109 switch (ir->op) {
1110 case ir_tex: inst = emit(SHADER_OPCODE_TEX, dst); break;
1111 case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break;
1112 case ir_txl: inst = emit(SHADER_OPCODE_TXL, dst); break;
1113 case ir_txd: inst = emit(SHADER_OPCODE_TXD, dst); break;
1114 case ir_txf: inst = emit(SHADER_OPCODE_TXF, dst); break;
1115 case ir_txs: inst = emit(SHADER_OPCODE_TXS, dst); break;
1116 }
1117 inst->base_mrf = base_mrf;
1118 inst->mlen = mlen;
1119 inst->header_present = header_present;
1120
1121 if (mlen > 11) {
1122 fail("Message length >11 disallowed by hardware\n");
1123 }
1124
1125 return inst;
1126 }
1127
1128 fs_reg
1129 fs_visitor::rescale_texcoord(ir_texture *ir, fs_reg coordinate,
1130 bool is_rect, int sampler, int texunit)
1131 {
1132 fs_inst *inst = NULL;
1133 bool needs_gl_clamp = true;
1134 fs_reg scale_x, scale_y;
1135
1136 /* The 965 requires the EU to do the normalization of GL rectangle
1137 * texture coordinates. We use the program parameter state
1138 * tracking to get the scaling factor.
1139 */
1140 if (is_rect &&
1141 (intel->gen < 6 ||
1142 (intel->gen >= 6 && (c->key.tex.gl_clamp_mask[0] & (1 << sampler) ||
1143 c->key.tex.gl_clamp_mask[1] & (1 << sampler))))) {
1144 struct gl_program_parameter_list *params = fp->Base.Parameters;
1145 int tokens[STATE_LENGTH] = {
1146 STATE_INTERNAL,
1147 STATE_TEXRECT_SCALE,
1148 texunit,
1149 0,
1150 0
1151 };
1152
1153 if (dispatch_width == 16) {
1154 fail("rectangle scale uniform setup not supported on 16-wide\n");
1155 return coordinate;
1156 }
1157
1158 scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
1159 scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
1160
1161 GLuint index = _mesa_add_state_reference(params,
1162 (gl_state_index *)tokens);
1163
1164 this->param_index[c->prog_data.nr_params] = index;
1165 this->param_offset[c->prog_data.nr_params] = 0;
1166 c->prog_data.nr_params++;
1167 this->param_index[c->prog_data.nr_params] = index;
1168 this->param_offset[c->prog_data.nr_params] = 1;
1169 c->prog_data.nr_params++;
1170 }
1171
1172 /* On pre-gen6, the EU has to perform the rectangle coordinate
1173 * normalization itself, so apply the TEXRECT_SCALE factors set up
1174 * above.
1175 */
1176 if (intel->gen < 6 && is_rect) {
1177 fs_reg dst = fs_reg(this, ir->coordinate->type);
1178 fs_reg src = coordinate;
1179 coordinate = dst;
1180
1181 emit(MUL(dst, src, scale_x));
1182 dst.reg_offset++;
1183 src.reg_offset++;
1184 emit(MUL(dst, src, scale_y));
1185 } else if (is_rect) {
1186 /* On gen6+, the sampler handles the rectangle coordinates
1187 * natively, without needing rescaling. But that means we have
1188 * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
1189 * not [0, 1] like the default case below.
1190 */
1191 needs_gl_clamp = false;
1192
1193 for (int i = 0; i < 2; i++) {
1194 if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
1195 fs_reg chan = coordinate;
1196 chan.reg_offset += i;
1197
1198 inst = emit(BRW_OPCODE_SEL, chan, chan, brw_imm_f(0.0));
1199 inst->conditional_mod = BRW_CONDITIONAL_G;
1200
1201 /* Our parameter comes in as 1.0/width or 1.0/height,
1202 * because that's what people normally want for doing
1203 * texture rectangle handling. We need width or height
1204 * for clamping, but we don't care enough to make a new
1205 * parameter type, so just invert back.
1206 */
1207 fs_reg limit = fs_reg(this, glsl_type::float_type);
1208 emit(MOV(limit, i == 0 ? scale_x : scale_y));
1209 emit(SHADER_OPCODE_RCP, limit, limit);
1210
1211 inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
1212 inst->conditional_mod = BRW_CONDITIONAL_L;
1213 }
1214 }
1215 }
1216
1217 if (ir->coordinate && needs_gl_clamp) {
1218 for (unsigned int i = 0;
1219 i < MIN2(ir->coordinate->type->vector_elements, 3); i++) {
1220 if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
1221 fs_reg chan = coordinate;
1222 chan.reg_offset += i;
1223
1224 fs_inst *inst = emit(MOV(chan, chan));
1225 inst->saturate = true;
1226 }
1227 }
1228 }
1229 return coordinate;
1230 }
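/* Sketch of the gen6+ GL_CLAMP handling above for a rectangle texture
 * of width W (illustrative assembly): the valid range is [0, W] rather
 * than [0, 1], so a saturating MOV won't do, and instead we emit
 *
 *    sel.g  u, u, 0.0         // max(u, 0)
 *    rcp    limit, scale_x    // recover W from the 1/W scale uniform
 *    sel.l  u, u, limit       // min(u, W)
 */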
1231
1232 void
1233 fs_visitor::visit(ir_texture *ir)
1234 {
1235 fs_inst *inst = NULL;
1236
1237 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &fp->Base);
1238 int texunit = fp->Base.SamplerUnits[sampler];
1239
1240 /* Should be lowered by do_lower_texture_projection */
1241 assert(!ir->projector);
1242
1243 /* Generate code to compute all the subexpression trees. This has to be
1244 * done before loading any values into MRFs for the sampler message since
1245 * generating these values may involve SEND messages that need the MRFs.
1246 */
1247 fs_reg coordinate;
1248 if (ir->coordinate) {
1249 ir->coordinate->accept(this);
1250
1251 coordinate = rescale_texcoord(ir, this->result,
1252 ir->sampler->type->sampler_dimensionality ==
1253 GLSL_SAMPLER_DIM_RECT,
1254 sampler, texunit);
1255 }
1256
1257 fs_reg shadow_comparitor;
1258 if (ir->shadow_comparitor) {
1259 ir->shadow_comparitor->accept(this);
1260 shadow_comparitor = this->result;
1261 }
1262
1263 fs_reg lod, lod2;
1264 switch (ir->op) {
1265 case ir_tex:
1266 break;
1267 case ir_txb:
1268 ir->lod_info.bias->accept(this);
1269 lod = this->result;
1270 break;
1271 case ir_txd:
1272 ir->lod_info.grad.dPdx->accept(this);
1273 lod = this->result;
1274
1275 ir->lod_info.grad.dPdy->accept(this);
1276 lod2 = this->result;
1277 break;
1278 case ir_txf:
1279 case ir_txl:
1280 case ir_txs:
1281 ir->lod_info.lod->accept(this);
1282 lod = this->result;
1283 break;
1284 }
1285
1286 /* Writemasking doesn't eliminate channels on SIMD8 texture
1287 * samples, so don't worry about them.
1288 */
1289 fs_reg dst = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1));
1290
1291 if (intel->gen >= 7) {
1292 inst = emit_texture_gen7(ir, dst, coordinate, shadow_comparitor,
1293 lod, lod2);
1294 } else if (intel->gen >= 5) {
1295 inst = emit_texture_gen5(ir, dst, coordinate, shadow_comparitor,
1296 lod, lod2);
1297 } else {
1298 inst = emit_texture_gen4(ir, dst, coordinate, shadow_comparitor,
1299 lod, lod2);
1300 }
1301
1302 /* The header is set up by generate_tex() when necessary. */
1303 inst->src[0] = reg_undef;
1304
1305 if (ir->offset != NULL && ir->op != ir_txf)
1306 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1307
1308 inst->sampler = sampler;
1309
1310 if (ir->shadow_comparitor)
1311 inst->shadow_compare = true;
1312
1313 swizzle_result(ir, dst, sampler);
1314 }
1315
1316 /**
1317 * Swizzle the result of a texture lookup. This is necessary for
1318 * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
1319 */
1320 void
1321 fs_visitor::swizzle_result(ir_texture *ir, fs_reg orig_val, int sampler)
1322 {
1323 this->result = orig_val;
1324
1325 if (ir->op == ir_txs)
1326 return;
1327
1328 if (ir->type == glsl_type::float_type) {
1329 /* Ignore DEPTH_TEXTURE_MODE swizzling. */
1330 assert(ir->sampler->type->sampler_shadow);
1331 } else if (c->key.tex.swizzles[sampler] != SWIZZLE_NOOP) {
1332 fs_reg swizzled_result = fs_reg(this, glsl_type::vec4_type);
1333
1334 for (int i = 0; i < 4; i++) {
1335 int swiz = GET_SWZ(c->key.tex.swizzles[sampler], i);
1336 fs_reg l = swizzled_result;
1337 l.reg_offset += i;
1338
1339 if (swiz == SWIZZLE_ZERO) {
1340 emit(MOV(l, fs_reg(0.0f)));
1341 } else if (swiz == SWIZZLE_ONE) {
1342 emit(MOV(l, fs_reg(1.0f)));
1343 } else {
1344 fs_reg r = orig_val;
1345 r.reg_offset += GET_SWZ(c->key.tex.swizzles[sampler], i);
1346 emit(MOV(l, r));
1347 }
1348 }
1349 this->result = swizzled_result;
1350 }
1351 }
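/* Example: with a texture swizzle of (GREEN, GREEN, GREEN, ONE) -- a
 * typical luminance-style emulation -- the loop above emits
 *
 *    mov swizzled.x, orig.y
 *    mov swizzled.y, orig.y
 *    mov swizzled.z, orig.y
 *    mov swizzled.w, 1.0f
 *
 * and points this->result at the swizzled copy.
 */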
1352
1353 void
1354 fs_visitor::visit(ir_swizzle *ir)
1355 {
1356 ir->val->accept(this);
1357 fs_reg val = this->result;
1358
1359 if (ir->type->vector_elements == 1) {
1360 this->result.reg_offset += ir->mask.x;
1361 return;
1362 }
1363
1364 fs_reg result = fs_reg(this, ir->type);
1365 this->result = result;
1366
1367 for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
1368 fs_reg channel = val;
1369 int swiz = 0;
1370
1371 switch (i) {
1372 case 0:
1373 swiz = ir->mask.x;
1374 break;
1375 case 1:
1376 swiz = ir->mask.y;
1377 break;
1378 case 2:
1379 swiz = ir->mask.z;
1380 break;
1381 case 3:
1382 swiz = ir->mask.w;
1383 break;
1384 }
1385
1386 channel.reg_offset += swiz;
1387 emit(MOV(result, channel));
1388 result.reg_offset++;
1389 }
1390 }
1391
1392 void
1393 fs_visitor::visit(ir_discard *ir)
1394 {
1395 assert(ir->condition == NULL); /* FINISHME */
1396
1397 emit(FS_OPCODE_DISCARD);
1398 }
1399
1400 void
1401 fs_visitor::visit(ir_constant *ir)
1402 {
1403 /* Set this->result to reg at the bottom of the function because some code
1404 * paths will cause this visitor to be applied to other fields. This will
1405 * cause the value stored in this->result to be modified.
1406 *
1407 * Make reg constant so that it doesn't get accidentally modified along the
1408 * way. Yes, I actually had this problem. :(
1409 */
1410 const fs_reg reg(this, ir->type);
1411 fs_reg dst_reg = reg;
1412
1413 if (ir->type->is_array()) {
1414 const unsigned size = type_size(ir->type->fields.array);
1415
1416 for (unsigned i = 0; i < ir->type->length; i++) {
1417 ir->array_elements[i]->accept(this);
1418 fs_reg src_reg = this->result;
1419
1420 dst_reg.type = src_reg.type;
1421 for (unsigned j = 0; j < size; j++) {
1422 emit(MOV(dst_reg, src_reg));
1423 src_reg.reg_offset++;
1424 dst_reg.reg_offset++;
1425 }
1426 }
1427 } else if (ir->type->is_record()) {
1428 foreach_list(node, &ir->components) {
1429 ir_constant *const field = (ir_constant *) node;
1430 const unsigned size = type_size(field->type);
1431
1432 field->accept(this);
1433 fs_reg src_reg = this->result;
1434
1435 dst_reg.type = src_reg.type;
1436 for (unsigned j = 0; j < size; j++) {
1437 emit(MOV(dst_reg, src_reg));
1438 src_reg.reg_offset++;
1439 dst_reg.reg_offset++;
1440 }
1441 }
1442 } else {
1443 const unsigned size = type_size(ir->type);
1444
1445 for (unsigned i = 0; i < size; i++) {
1446 switch (ir->type->base_type) {
1447 case GLSL_TYPE_FLOAT:
1448 emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
1449 break;
1450 case GLSL_TYPE_UINT:
1451 emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
1452 break;
1453 case GLSL_TYPE_INT:
1454 emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
1455 break;
1456 case GLSL_TYPE_BOOL:
1457 emit(MOV(dst_reg, fs_reg((int)ir->value.b[i])));
1458 break;
1459 default:
1460 assert(!"Non-float/uint/int/bool constant");
1461 }
1462 dst_reg.reg_offset++;
1463 }
1464 }
1465
1466 this->result = reg;
1467 }
1468
1469 void
1470 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
1471 {
1472 ir_expression *expr = ir->as_expression();
1473
1474 if (expr) {
1475 fs_reg op[2];
1476 fs_inst *inst;
1477
1478 assert(expr->get_num_operands() <= 2);
1479 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1480 assert(expr->operands[i]->type->is_scalar());
1481
1482 expr->operands[i]->accept(this);
1483 op[i] = this->result;
1484
1485 resolve_ud_negate(&op[i]);
1486 }
1487
1488 switch (expr->operation) {
1489 case ir_unop_logic_not:
1490 inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
1491 inst->conditional_mod = BRW_CONDITIONAL_Z;
1492 break;
1493
1494 case ir_binop_logic_xor:
1495 case ir_binop_logic_or:
1496 case ir_binop_logic_and:
1497 goto out;
1498
1499 case ir_unop_f2b:
1500 if (intel->gen >= 6) {
1501 emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
1502 } else {
1503 inst = emit(MOV(reg_null_f, op[0]));
1504 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1505 }
1506 break;
1507
1508 case ir_unop_i2b:
1509 if (intel->gen >= 6) {
1510 emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
1511 } else {
1512 inst = emit(MOV(reg_null_d, op[0]));
1513 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1514 }
1515 break;
1516
1517 case ir_binop_greater:
1518 case ir_binop_gequal:
1519 case ir_binop_less:
1520 case ir_binop_lequal:
1521 case ir_binop_equal:
1522 case ir_binop_all_equal:
1523 case ir_binop_nequal:
1524 case ir_binop_any_nequal:
1525 resolve_bool_comparison(expr->operands[0], &op[0]);
1526 resolve_bool_comparison(expr->operands[1], &op[1]);
1527
1528 emit(CMP(reg_null_d, op[0], op[1],
1529 brw_conditional_for_comparison(expr->operation)));
1530 break;
1531
1532 default:
1533 assert(!"not reached");
1534 fail("bad cond code\n");
1535 break;
1536 }
1537 return;
1538 }
1539
1540 out:
1541 ir->accept(this);
1542
1543 fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
1544 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1545 }
1546
1547 /**
1548 * Emit a gen6 IF statement with the comparison folded into the IF
1549 * instruction.
1550 */
1551 void
1552 fs_visitor::emit_if_gen6(ir_if *ir)
1553 {
1554 ir_expression *expr = ir->condition->as_expression();
1555
1556 if (expr) {
1557 fs_reg op[2];
1558 fs_inst *inst;
1559 fs_reg temp;
1560
1561 assert(expr->get_num_operands() <= 2);
1562 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1563 assert(expr->operands[i]->type->is_scalar());
1564
1565 expr->operands[i]->accept(this);
1566 op[i] = this->result;
1567 }
1568
1569 switch (expr->operation) {
1570 case ir_unop_logic_not:
1571 case ir_binop_logic_xor:
1572 case ir_binop_logic_or:
1573 case ir_binop_logic_and:
1574 /* For operations on bool arguments, only the low bit of the bool is
1575 * valid, and the others are undefined. Fall back to the condition
1576 * code path.
1577 */
1578 break;
1579
1580 case ir_unop_f2b:
1581 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
1582 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1583 return;
1584
1585 case ir_unop_i2b:
1586 emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
1587 return;
1588
1589 case ir_binop_greater:
1590 case ir_binop_gequal:
1591 case ir_binop_less:
1592 case ir_binop_lequal:
1593 case ir_binop_equal:
1594 case ir_binop_all_equal:
1595 case ir_binop_nequal:
1596 case ir_binop_any_nequal:
1597 resolve_bool_comparison(expr->operands[0], &op[0]);
1598 resolve_bool_comparison(expr->operands[1], &op[1]);
1599
1600 emit(IF(op[0], op[1],
1601 brw_conditional_for_comparison(expr->operation)));
1602 return;
1603 default:
1604 assert(!"not reached");
1605 emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
1606 fail("bad condition\n");
1607 return;
1608 }
1609 }
1610
1611 emit_bool_to_cond_code(ir->condition);
1612 fs_inst *inst = emit(BRW_OPCODE_IF);
1613 inst->predicate = BRW_PREDICATE_NORMAL;
1614 }
1615
1616 void
1617 fs_visitor::visit(ir_if *ir)
1618 {
1619 if (intel->gen < 6 && dispatch_width == 16) {
1620 fail("Can't support (non-uniform) control flow on 16-wide\n");
1621 }
1622
1623 /* Don't point the annotation at the if statement, because then it plus
1624 * the then and else blocks get printed.
1625 */
1626 this->base_ir = ir->condition;
1627
1628 if (intel->gen == 6) {
1629 emit_if_gen6(ir);
1630 } else {
1631 emit_bool_to_cond_code(ir->condition);
1632
1633 emit(IF(BRW_PREDICATE_NORMAL));
1634 }
1635
1636 foreach_list(node, &ir->then_instructions) {
1637 ir_instruction *ir = (ir_instruction *)node;
1638 this->base_ir = ir;
1639
1640 ir->accept(this);
1641 }
1642
1643 if (!ir->else_instructions.is_empty()) {
1644 emit(BRW_OPCODE_ELSE);
1645
1646 foreach_list(node, &ir->else_instructions) {
1647 ir_instruction *ir = (ir_instruction *)node;
1648 this->base_ir = ir;
1649
1650 ir->accept(this);
1651 }
1652 }
1653
1654 emit(BRW_OPCODE_ENDIF);
1655 }
1656
1657 void
1658 fs_visitor::visit(ir_loop *ir)
1659 {
1660 fs_reg counter = reg_undef;
1661
1662 if (intel->gen < 6 && dispatch_width == 16) {
1663 fail("Can't support (non-uniform) control flow on 16-wide\n");
1664 }
1665
1666 if (ir->counter) {
1667 this->base_ir = ir->counter;
1668 ir->counter->accept(this);
1669 counter = *(variable_storage(ir->counter));
1670
1671 if (ir->from) {
1672 this->base_ir = ir->from;
1673 ir->from->accept(this);
1674
1675 emit(MOV(counter, this->result));
1676 }
1677 }
1678
1679 this->base_ir = NULL;
1680 emit(BRW_OPCODE_DO);
1681
1682 if (ir->to) {
1683 this->base_ir = ir->to;
1684 ir->to->accept(this);
1685
1686 emit(CMP(reg_null_d, counter, this->result,
1687 brw_conditional_for_comparison(ir->cmp)));
1688
1689 fs_inst *inst = emit(BRW_OPCODE_BREAK);
1690 inst->predicate = BRW_PREDICATE_NORMAL;
1691 }
1692
1693 foreach_list(node, &ir->body_instructions) {
1694 ir_instruction *ir = (ir_instruction *)node;
1695
1696 this->base_ir = ir;
1697 ir->accept(this);
1698 }
1699
1700 if (ir->increment) {
1701 this->base_ir = ir->increment;
1702 ir->increment->accept(this);
1703 emit(ADD(counter, counter, this->result));
1704 }
1705
1706 this->base_ir = NULL;
1707 emit(BRW_OPCODE_WHILE);
1708 }
1709
1710 void
1711 fs_visitor::visit(ir_loop_jump *ir)
1712 {
1713 switch (ir->mode) {
1714 case ir_loop_jump::jump_break:
1715 emit(BRW_OPCODE_BREAK);
1716 break;
1717 case ir_loop_jump::jump_continue:
1718 emit(BRW_OPCODE_CONTINUE);
1719 break;
1720 }
1721 }
1722
1723 void
1724 fs_visitor::visit(ir_call *ir)
1725 {
1726 assert(!"FINISHME");
1727 }
1728
1729 void
1730 fs_visitor::visit(ir_return *ir)
1731 {
1732 assert(!"FINISHME");
1733 }
1734
1735 void
1736 fs_visitor::visit(ir_function *ir)
1737 {
1738 /* Ignore function bodies other than main() -- we shouldn't see calls to
1739 * them since they should all be inlined by the time we get here.
1740 */
1741 if (strcmp(ir->name, "main") == 0) {
1742 const ir_function_signature *sig;
1743 exec_list empty;
1744
1745 sig = ir->matching_signature(&empty);
1746
1747 assert(sig);
1748
1749 foreach_list(node, &sig->body) {
1750 ir_instruction *ir = (ir_instruction *)node;
1751 this->base_ir = ir;
1752
1753 ir->accept(this);
1754 }
1755 }
1756 }
1757
1758 void
1759 fs_visitor::visit(ir_function_signature *ir)
1760 {
1761 assert(!"not reached");
1762 (void)ir;
1763 }
1764
1765 fs_inst *
1766 fs_visitor::emit(fs_inst inst)
1767 {
1768 fs_inst *list_inst = new(mem_ctx) fs_inst;
1769 *list_inst = inst;
1770 emit(list_inst);
1771 return list_inst;
1772 }
1773
1774 fs_inst *
1775 fs_visitor::emit(fs_inst *inst)
1776 {
1777 if (force_uncompressed_stack > 0)
1778 inst->force_uncompressed = true;
1779 else if (force_sechalf_stack > 0)
1780 inst->force_sechalf = true;
1781
1782 inst->annotation = this->current_annotation;
1783 inst->ir = this->base_ir;
1784
1785 this->instructions.push_tail(inst);
1786
1787 return inst;
1788 }
1789
1790 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1791 void
1792 fs_visitor::emit_dummy_fs()
1793 {
1794 int reg_width = dispatch_width / 8;
1795
1796 /* Everyone's favorite color. */
1797 emit(MOV(fs_reg(MRF, 2 + 0 * reg_width), fs_reg(1.0f)));
1798 emit(MOV(fs_reg(MRF, 2 + 1 * reg_width), fs_reg(0.0f)));
1799 emit(MOV(fs_reg(MRF, 2 + 2 * reg_width), fs_reg(1.0f)));
1800 emit(MOV(fs_reg(MRF, 2 + 3 * reg_width), fs_reg(0.0f)));
1801
1802 fs_inst *write;
1803 write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
1804 write->base_mrf = 2;
1805 write->mlen = 4 * reg_width;
1806 write->eot = true;
1807 }
1808
1809 /* The register location here is relative to the start of the URB
1810 * data. It will get adjusted to be a real location before
1811 * generate_code() time.
1812 */
1813 struct brw_reg
1814 fs_visitor::interp_reg(int location, int channel)
1815 {
1816 int regnr = urb_setup[location] * 2 + channel / 2;
1817 int stride = (channel & 1) * 4;
1818
1819 assert(urb_setup[location] != -1);
1820
1821 return brw_vec1_grf(regnr, stride);
1822 }
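/* Worked example: each attribute's setup data takes two registers, with
 * two channels packed per register.  For channel 3 of an attribute with
 * urb_setup[location] == 5, regnr = 5 * 2 + 3 / 2 = 11 and the
 * suboffset is (3 & 1) * 4 = 4, i.e. the second half of the attribute's
 * second setup register.
 */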
1823
1824 /** Emits the interpolation for the varying inputs. */
1825 void
1826 fs_visitor::emit_interpolation_setup_gen4()
1827 {
1828 this->current_annotation = "compute pixel centers";
1829 this->pixel_x = fs_reg(this, glsl_type::uint_type);
1830 this->pixel_y = fs_reg(this, glsl_type::uint_type);
1831 this->pixel_x.type = BRW_REGISTER_TYPE_UW;
1832 this->pixel_y.type = BRW_REGISTER_TYPE_UW;
1833
1834 emit(FS_OPCODE_PIXEL_X, this->pixel_x);
1835 emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
1836
1837 this->current_annotation = "compute pixel deltas from v0";
1838 if (brw->has_pln) {
1839 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1840 fs_reg(this, glsl_type::vec2_type);
1841 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1842 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
1843 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg_offset++;
1844 } else {
1845 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1846 fs_reg(this, glsl_type::float_type);
1847 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1848 fs_reg(this, glsl_type::float_type);
1849 }
1850 emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1851 this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))));
1852 emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1853 this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))));
1854
1855 this->current_annotation = "compute pos.w and 1/pos.w";
1856 /* Compute wpos.w. It's always in our setup, since it's needed to
1857 * interpolate the other attributes.
1858 */
1859 this->wpos_w = fs_reg(this, glsl_type::float_type);
1860 emit(FS_OPCODE_LINTERP, wpos_w,
1861 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1862 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1863 interp_reg(FRAG_ATTRIB_WPOS, 3));
1864 /* Compute the pixel 1/W value from wpos.w. */
1865 this->pixel_w = fs_reg(this, glsl_type::float_type);
1866 emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
1867 this->current_annotation = NULL;
1868 }
1869
1870 /** Emits the interpolation for the varying inputs. */
1871 void
1872 fs_visitor::emit_interpolation_setup_gen6()
1873 {
1874 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1875
1876 /* If the pixel centers end up being used, the setup is the same as for gen4. */
1877 this->current_annotation = "compute pixel centers";
1878 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
1879 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
1880 int_pixel_x.type = BRW_REGISTER_TYPE_UW;
1881 int_pixel_y.type = BRW_REGISTER_TYPE_UW;
1882 emit(ADD(int_pixel_x,
1883 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1884 fs_reg(brw_imm_v(0x10101010))));
1885 emit(ADD(int_pixel_y,
1886 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1887 fs_reg(brw_imm_v(0x11001100))));
1888
1889 /* As of gen6, we can no longer mix float and int sources. We have
1890 * to turn the integer pixel centers into floats for their actual
1891 * use.
1892 */
1893 this->pixel_x = fs_reg(this, glsl_type::float_type);
1894 this->pixel_y = fs_reg(this, glsl_type::float_type);
1895 emit(MOV(this->pixel_x, int_pixel_x));
1896 emit(MOV(this->pixel_y, int_pixel_y));
1897
1898 this->current_annotation = "compute pos.w";
1899 this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
1900 this->wpos_w = fs_reg(this, glsl_type::float_type);
1901 emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
1902
1903 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
1904 uint8_t reg = c->barycentric_coord_reg[i];
1905 this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
1906 this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
1907 }
1908
1909 this->current_annotation = NULL;
1910 }
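/* The brw_imm_v() arguments above are packed vectors of eight 4-bit
 * values, lowest nibble first: 0x10101010 decodes to <0,1,0,1,0,1,0,1>
 * and 0x11001100 to <0,0,1,1,0,0,1,1>.  Adding those to the subspan
 * origin coordinates replicated from g1 yields the integer x/y pixel
 * positions of two 2x2 subspans at once.
 */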
1911
1912 void
1913 fs_visitor::emit_color_write(int target, int index, int first_color_mrf)
1914 {
1915 int reg_width = dispatch_width / 8;
1916 fs_inst *inst;
1917 fs_reg color = outputs[target];
1918 fs_reg mrf;
1919
1920 /* If there's no color data to be written, skip it. */
1921 if (color.file == BAD_FILE)
1922 return;
1923
1924 color.reg_offset += index;
1925
1926 if (dispatch_width == 8 || intel->gen >= 6) {
1927 /* SIMD8 write looks like:
1928 * m + 0: r0
1929 * m + 1: r1
1930 * m + 2: g0
1931 * m + 3: g1
1932 *
1933 * gen6 SIMD16 DP write looks like:
1934 * m + 0: r0
1935 * m + 1: r1
1936 * m + 2: g0
1937 * m + 3: g1
1938 * m + 4: b0
1939 * m + 5: b1
1940 * m + 6: a0
1941 * m + 7: a1
1942 */
1943 inst = emit(MOV(fs_reg(MRF, first_color_mrf + index * reg_width,
1944 color.type),
1945 color));
1946 inst->saturate = c->key.clamp_fragment_color;
1947 } else {
1948 /* pre-gen6 SIMD16 single source DP write looks like:
1949 * m + 0: r0
1950 * m + 1: g0
1951 * m + 2: b0
1952 * m + 3: a0
1953 * m + 4: r1
1954 * m + 5: g1
1955 * m + 6: b1
1956 * m + 7: a1
1957 */
1958 if (brw->has_compr4) {
1959 /* By setting the high bit of the MRF register number, we
1960 * indicate that we want COMPR4 mode - instead of doing the
1961 * usual destination + 1 for the second half we get
1962 * destination + 4.
1963 */
1964 inst = emit(MOV(fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index,
1965 color.type),
1966 color));
1967 inst->saturate = c->key.clamp_fragment_color;
1968 } else {
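         /* Without COMPR4, emit the two SIMD8 halves separately: the first
          * half uncompressed into m + index, then the second half (sechalf)
          * into m + index + 4.
          */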
1969 push_force_uncompressed();
1970 inst = emit(MOV(fs_reg(MRF, first_color_mrf + index, color.type),
1971 color));
1972 inst->saturate = c->key.clamp_fragment_color;
1973 pop_force_uncompressed();
1974
1975 push_force_sechalf();
1976 color.sechalf = true;
1977 inst = emit(MOV(fs_reg(MRF, first_color_mrf + index + 4, color.type),
1978 color));
1979 inst->saturate = c->key.clamp_fragment_color;
1980 pop_force_sechalf();
1981 color.sechalf = false;
1982 }
1983 }
1984 }
1985
1986 void
1987 fs_visitor::emit_fb_writes()
1988 {
1989 this->current_annotation = "FB write header";
1990 bool header_present = true;
1991 /* We can potentially have a message length of up to 15, so we have to set
1992 * base_mrf to either 0 or 1 in order to fit in m0..m15.
1993 */
1994 int base_mrf = 1;
1995 int nr = base_mrf;
1996 int reg_width = dispatch_width / 8;
1997 bool do_dual_src = this->dual_src_output.file != BAD_FILE;
1998 bool src0_alpha_to_render_target = false;
1999
2000 if (dispatch_width == 16 && do_dual_src) {
2001 fail("GL_ARB_blend_func_extended not yet supported in 16-wide.");
2002 do_dual_src = false;
2003 }
2004
2005 /* From the Sandy Bridge PRM, volume 4, page 198:
2006 *
2007 * "Dispatched Pixel Enables. One bit per pixel indicating
2008 * which pixels were originally enabled when the thread was
2009 * dispatched. This field is only required for the end-of-
2010 * thread message and on all dual-source messages."
2011 */
2012 if (intel->gen >= 6 &&
2013 !this->fp->UsesKill &&
2014 !do_dual_src &&
2015 c->key.nr_color_regions == 1) {
2016 header_present = false;
2017 }
2018
2019 if (header_present) {
2020 src0_alpha_to_render_target = intel->gen >= 6 &&
2021 !do_dual_src &&
2022 c->key.nr_color_regions > 1 &&
2023 c->key.sample_alpha_to_coverage;
2024       /* The message header occupies two MRFs (m1 and m2, given base_mrf == 1). */
2025 nr += 2;
2026 }
2027
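   /* Assemble the rest of the message payload in order: AA destination
    * stencil (if present), the color data, then optional source and
    * destination depth, bumping nr along the way so the final message
    * length comes out right.
    */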
2028 if (c->aa_dest_stencil_reg) {
2029 push_force_uncompressed();
2030 emit(MOV(fs_reg(MRF, nr++),
2031 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))));
2032 pop_force_uncompressed();
2033 }
2034
2035 /* Reserve space for color. It'll be filled in per MRT below. */
2036 int color_mrf = nr;
2037 nr += 4 * reg_width;
2038 if (do_dual_src)
2039 nr += 4;
2040 if (src0_alpha_to_render_target)
2041 nr += reg_width;
2042
2043 if (c->source_depth_to_render_target) {
2044 if (intel->gen == 6 && dispatch_width == 16) {
2045 /* For outputting oDepth on gen6, SIMD8 writes have to be
2046 * used. This would require 8-wide moves of each half to
2047 * message regs, kind of like pre-gen5 SIMD16 FB writes.
2048 * Just bail on doing so for now.
2049 */
2050 fail("Missing support for simd16 depth writes on gen6\n");
2051 }
2052
2053 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2054 /* Hand over gl_FragDepth. */
2055 assert(this->frag_depth.file != BAD_FILE);
2056 emit(MOV(fs_reg(MRF, nr), this->frag_depth));
2057 } else {
2058 /* Pass through the payload depth. */
2059 emit(MOV(fs_reg(MRF, nr),
2060 fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
2061 }
2062 nr += reg_width;
2063 }
2064
2065 if (c->dest_depth_reg) {
2066 emit(MOV(fs_reg(MRF, nr),
2067 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))));
2068 nr += reg_width;
2069 }
2070
2071 if (do_dual_src) {
2072 fs_reg src0 = this->outputs[0];
2073 fs_reg src1 = this->dual_src_output;
2074
2075 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2076 "FB write src0");
2077 for (int i = 0; i < 4; i++) {
2078 fs_inst *inst = emit(MOV(fs_reg(MRF, color_mrf + i, src0.type), src0));
2079 src0.reg_offset++;
2080 inst->saturate = c->key.clamp_fragment_color;
2081 }
2082
2083 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2084 "FB write src1");
2085 for (int i = 0; i < 4; i++) {
2086 fs_inst *inst = emit(MOV(fs_reg(MRF, color_mrf + 4 + i, src1.type),
2087 src1));
2088 src1.reg_offset++;
2089 inst->saturate = c->key.clamp_fragment_color;
2090 }
2091
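      /* Dual-source blend uses a single render target write carrying both
       * colors; it also serves as the end-of-thread message.
       */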
2092 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2093 inst->target = 0;
2094 inst->base_mrf = base_mrf;
2095 inst->mlen = nr - base_mrf;
2096 inst->eot = true;
2097 inst->header_present = header_present;
2098
2099 c->prog_data.dual_src_blend = true;
2100 this->current_annotation = NULL;
2101 return;
2102 }
2103
2104 for (int target = 0; target < c->key.nr_color_regions; target++) {
2105 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2106 "FB write target %d",
2107 target);
2108 /* If src0_alpha_to_render_target is true, include source zero alpha
2109        * data in the RenderTargetWrite message for targets > 0.
2110 */
2111 int write_color_mrf = color_mrf;
2112 if (src0_alpha_to_render_target && target != 0) {
2113 fs_inst *inst;
2114 fs_reg color = outputs[0];
2115 color.reg_offset += 3;
2116
2117 inst = emit(MOV(fs_reg(MRF, write_color_mrf, color.type),
2118 color));
2119 inst->saturate = c->key.clamp_fragment_color;
2120 write_color_mrf = color_mrf + reg_width;
2121 }
2122
2123 for (unsigned i = 0; i < this->output_components[target]; i++)
2124 emit_color_write(target, i, write_color_mrf);
2125
2126 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2127 inst->target = target;
2128 inst->base_mrf = base_mrf;
2129 if (src0_alpha_to_render_target && target == 0)
2130 inst->mlen = nr - base_mrf - reg_width;
2131 else
2132 inst->mlen = nr - base_mrf;
2133 if (target == c->key.nr_color_regions - 1)
2134 inst->eot = true;
2135 inst->header_present = header_present;
2136 }
2137
2138 if (c->key.nr_color_regions == 0) {
2139       /* Even if there are no color buffers enabled, we still need to send
2140        * alpha down the pipeline to our null renderbuffer to support
2141 * alpha-testing, alpha-to-coverage, and so on.
2142 */
2143 emit_color_write(0, 3, color_mrf);
2144
2145 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2146 inst->base_mrf = base_mrf;
2147 inst->mlen = nr - base_mrf;
2148 inst->eot = true;
2149 inst->header_present = header_present;
2150 }
2151
2152 this->current_annotation = NULL;
2153 }
2154
2155 void
2156 fs_visitor::resolve_ud_negate(fs_reg *reg)
2157 {
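   /* Resolve a pending negate modifier on an unsigned source by applying
    * it through a MOV into a temporary, so that later instructions see a
    * plain UD value.
    */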
2158 if (reg->type != BRW_REGISTER_TYPE_UD ||
2159 !reg->negate)
2160 return;
2161
2162 fs_reg temp = fs_reg(this, glsl_type::uint_type);
2163 emit(MOV(temp, *reg));
2164 *reg = temp;
2165 }
2166
2167 void
2168 fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
2169 {
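   /* A comparison result may have more than just its low bit set; AND
    * with 1 canonicalizes the boolean to 0 or 1 before it is consumed as
    * a value.
    */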
2170 if (rvalue->type != glsl_type::bool_type)
2171 return;
2172
2173 fs_reg temp = fs_reg(this, glsl_type::bool_type);
2174 emit(AND(temp, *reg, fs_reg(1)));
2175 *reg = temp;
2176 }
2177
2178 fs_visitor::fs_visitor(struct brw_context *brw,
2179 struct brw_wm_compile *c,
2180 struct gl_shader_program *prog,
2181 struct gl_fragment_program *fp,
2182 unsigned dispatch_width)
2183 : dispatch_width(dispatch_width)
2184 {
2185 this->c = c;
2186 this->brw = brw;
2187 this->fp = fp;
2188 this->prog = prog;
2189 this->intel = &brw->intel;
2190 this->ctx = &intel->ctx;
2191 this->mem_ctx = ralloc_context(NULL);
2192 if (prog)
2193 shader = (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2194 else
2195 shader = NULL;
2196 this->failed = false;
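   /* variable_ht maps each ir_variable to the fs_reg storage backing it. */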
2197 this->variable_ht = hash_table_ctor(0,
2198 hash_table_pointer_hash,
2199 hash_table_pointer_compare);
2200
2201 memset(this->outputs, 0, sizeof(this->outputs));
2202 memset(this->output_components, 0, sizeof(this->output_components));
2203 this->first_non_payload_grf = 0;
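   /* Gen7 has no MRF file; message payloads are built in GRFs instead, so
    * stop the allocator short of the GRF range used as stand-in MRFs.
    */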
2204 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2205
2206 this->current_annotation = NULL;
2207 this->base_ir = NULL;
2208
2209 this->virtual_grf_sizes = NULL;
2210 this->virtual_grf_count = 0;
2211 this->virtual_grf_array_size = 0;
2212 this->virtual_grf_def = NULL;
2213 this->virtual_grf_use = NULL;
2214 this->live_intervals_valid = false;
2215
2216 this->force_uncompressed_stack = 0;
2217 this->force_sechalf_stack = 0;
2218 }
2219
2220 fs_visitor::~fs_visitor()
2221 {
2222 ralloc_free(this->mem_ctx);
2223 hash_table_dtor(this->variable_ht);
2224 }