src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_visitor.cpp
  25  *
  26  * This file supports generating the FS LIR from the GLSL IR.  The LIR
  27  * makes it easier to do backend-specific optimizations than doing so
  28  * in the GLSL IR or in the native code.
  29  */
  30 extern "C" {
  31
  32 #include <sys/types.h>
  33
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "program/prog_parameter.h"
  37 #include "program/prog_print.h"
  38 #include "program/prog_optimize.h"
  39 #include "program/register_allocate.h"
  40 #include "program/sampler.h"
  41 #include "program/hash_table.h"
  42 #include "brw_context.h"
  43 #include "brw_eu.h"
  44 #include "brw_wm.h"
  45 }
  46 #include "brw_fs.h"
  47 #include "main/uniforms.h"
  48 #include "glsl/glsl_types.h"
  49 #include "glsl/ir_optimization.h"
  50
  51 void
  52 fs_visitor::visit(ir_variable *ir)
  53 {
  54    fs_reg *reg = NULL;
  55
  56    if (variable_storage(ir))
  57       return;
  58
  59    if (ir->data.mode == ir_var_shader_in) {
  60       if (!strcmp(ir->name, "gl_FragCoord")) {
  61          reg = emit_fragcoord_interpolation(ir);
  62       } else if (!strcmp(ir->name, "gl_FrontFacing")) {
  63          reg = emit_frontfacing_interpolation(ir);
  64       } else {
  65          reg = emit_general_interpolation(ir);
  66       }
  67       assert(reg);
  68       hash_table_insert(this->variable_ht, reg, ir);
  69       return;
  70    } else if (ir->data.mode == ir_var_shader_out) {
  71       reg = new(this->mem_ctx) fs_reg(this, ir->type);
  72
  73       if (ir->data.index > 0) {
  74          assert(ir->data.location == FRAG_RESULT_DATA0);
  75          assert(ir->data.index == 1);
  76          this->dual_src_output = *reg;
  77          this->do_dual_src = true;
  78       } else if (ir->data.location == FRAG_RESULT_COLOR) {
  79          /* Writing gl_FragColor outputs to all color regions. */
  80          for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
  81             this->outputs[i] = *reg;
  82             this->output_components[i] = 4;
  83          }
  84       } else if (ir->data.location == FRAG_RESULT_DEPTH) {
  85          this->frag_depth = *reg;
  86       } else if (ir->data.location == FRAG_RESULT_SAMPLE_MASK) {
  87          this->sample_mask = *reg;
  88       } else {
  89          /* gl_FragData or a user-defined FS output */
  90          assert(ir->data.location >= FRAG_RESULT_DATA0 &&
  91                 ir->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
  92
  93          int vector_elements =
  94             ir->type->is_array() ? ir->type->fields.array->vector_elements
  95                                  : ir->type->vector_elements;
  96
  97          /* General color output. */
  98          for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
  99             int output = ir->data.location - FRAG_RESULT_DATA0 + i;
 100             this->outputs[output] = *reg;
 101             this->outputs[output].reg_offset += vector_elements * i;
 102             this->output_components[output] = vector_elements;
 103          }
 104       }
 105    } else if (ir->data.mode == ir_var_uniform) {
 106       int param_index = uniforms;
 107
 108       /* Thanks to the lower_ubo_reference pass, we will see only
 109        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 110        * variables, so no need for them to be in variable_ht.
 111        *
 112        * Atomic counters take no uniform storage, no need to do
 113        * anything here.
 114        */
 115       if (ir->is_in_uniform_block() || ir->type->contains_atomic())
 116          return;
 117
 118       if (dispatch_width == 16) {
 119          if (!variable_storage(ir)) {
 120             fail("Failed to find uniform '%s' in SIMD16\n", ir->name);
 121          }
 122          return;
 123       }
 124
 125       param_size[param_index] = type_size(ir->type);
 126       if (!strncmp(ir->name, "gl_", 3)) {
 127          setup_builtin_uniform_values(ir);
 128       } else {
 129          setup_uniform_values(ir);
 130       }
 131
 132       reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
 133       reg->type = brw_type_for_base_type(ir->type);
 134
 135    } else if (ir->data.mode == ir_var_system_value) {
 136       if (ir->data.location == SYSTEM_VALUE_SAMPLE_POS) {
 137          reg = emit_samplepos_setup(ir);
 138       } else if (ir->data.location == SYSTEM_VALUE_SAMPLE_ID) {
 139          reg = emit_sampleid_setup(ir);
 140       } else if (ir->data.location == SYSTEM_VALUE_SAMPLE_MASK_IN) {
 141          assert(brw->gen >= 7);
 142          reg = new(mem_ctx)
 143             fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
 144                           BRW_REGISTER_TYPE_D));
 145       }
 146    }
 147
 148    if (!reg)
 149       reg = new(this->mem_ctx) fs_reg(this, ir->type);
 150
 151    hash_table_insert(this->variable_ht, reg, ir);
 152 }
 153
 154 void
 155 fs_visitor::visit(ir_dereference_variable *ir)
 156 {
 157    fs_reg *reg = variable_storage(ir->var);
 158
 159    if (!reg) {
 160       fail("Failed to find variable storage for %s\n", ir->var->name);
 161       this->result = fs_reg(reg_null_d);
 162       return;
 163    }
 164    this->result = *reg;
 165 }
 166
 167 void
 168 fs_visitor::visit(ir_dereference_record *ir)
 169 {
 170    const glsl_type *struct_type = ir->record->type;
 171
 172    ir->record->accept(this);
 173
 174    unsigned int offset = 0;
 175    for (unsigned int i = 0; i < struct_type->length; i++) {
 176       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
 177          break;
 178       offset += type_size(struct_type->fields.structure[i].type);
 179    }
 180    this->result.reg_offset += offset;
 181    this->result.type = brw_type_for_base_type(ir->type);
 182 }
 183
 184 void
 185 fs_visitor::visit(ir_dereference_array *ir)
 186 {
 187    ir_constant *constant_index;
 188    fs_reg src;
 189    int element_size = type_size(ir->type);
 190
 191    constant_index = ir->array_index->as_constant();
 192
 193    ir->array->accept(this);
 194    src = this->result;
 195    src.type = brw_type_for_base_type(ir->type);
 196
 197    if (constant_index) {
 198       assert(src.file == UNIFORM || src.file == GRF || src.file == HW_REG);
 199       src.reg_offset += constant_index->value.i[0] * element_size;
 200    } else {
 201       /* Variable index array dereference.  We attach the variable index
 202        * component to the reg as a pointer to a register containing the
 203        * offset.  Currently only uniform arrays are supported in this patch,
 204        * and that reladdr pointer is resolved by
 205        * move_uniform_array_access_to_pull_constants().  All other array types
 206        * are lowered by lower_variable_index_to_cond_assign().
 207        */
 208       ir->array_index->accept(this);
 209
 210       fs_reg index_reg;
 211       index_reg = fs_reg(this, glsl_type::int_type);
 212       emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));
 213
 214       if (src.reladdr) {
 215          emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
 216       }
 217
 218       src.reladdr = ralloc(mem_ctx, fs_reg);
 219       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
 220    }
 221    this->result = src;
 222 }
 223
 224 void
 225 fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
 226                      const fs_reg &a)
 227 {
 228    if (brw->gen < 6 ||
 229        !x.is_valid_3src() ||
 230        !y.is_valid_3src() ||
 231        !a.is_valid_3src()) {
 232       /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
 233       fs_reg y_times_a           = fs_reg(this, glsl_type::float_type);
 234       fs_reg one_minus_a         = fs_reg(this, glsl_type::float_type);
 235       fs_reg x_times_one_minus_a = fs_reg(this, glsl_type::float_type);
 236
 237       emit(MUL(y_times_a, y, a));
 238
 239       fs_reg negative_a = a;
 240       negative_a.negate = !a.negate;
 241       emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
 242       emit(MUL(x_times_one_minus_a, x, one_minus_a));
 243
 244       emit(ADD(dst, x_times_one_minus_a, y_times_a));
 245    } else {
 246       /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
 247        * we need to reorder the operands.
 248        */
 249       emit(LRP(dst, a, y, x));
 250    }
 251 }
 252
 253 void
 254 fs_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
 255                         const fs_reg &src0, const fs_reg &src1)
 256 {
 257    fs_inst *inst;
 258
 259    if (brw->gen >= 6) {
 260       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 261       inst->conditional_mod = conditionalmod;
 262    } else {
 263       emit(CMP(reg_null_d, src0, src1, conditionalmod));
 264
 265       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 266       inst->predicate = BRW_PREDICATE_NORMAL;
 267    }
 268 }
 269
 270 /* Instruction selection: Produce a MOV.sat instead of
 271  * MIN(MAX(val, 0), 1) when possible.
 272  */
 273 bool
 274 fs_visitor::try_emit_saturate(ir_expression *ir)
 275 {
 276    ir_rvalue *sat_val = ir->as_rvalue_to_saturate();
 277
 278    if (!sat_val)
 279       return false;
 280
 281    fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
 282
 283    sat_val->accept(this);
 284    fs_reg src = this->result;
 285
 286    fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
 287
 288    /* If the last instruction from our accept() didn't generate our
 289     * src, generate a saturated MOV
 290     */
 291    fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
 292    if (!modify || modify->regs_written != 1) {
 293       this->result = fs_reg(this, ir->type);
 294       fs_inst *inst = emit(MOV(this->result, src));
 295       inst->saturate = true;
 296    } else {
 297       modify->saturate = true;
 298       this->result = src;
 299    }
 300
 301
 302    return true;
 303 }
 304
 305 bool
 306 fs_visitor::try_emit_mad(ir_expression *ir)
 307 {
 308    /* 3-src instructions were introduced in gen6. */
 309    if (brw->gen < 6)
 310       return false;
 311
 312    /* MAD can only handle floating-point data. */
 313    if (ir->type != glsl_type::float_type)
 314       return false;
 315
 316    ir_rvalue *nonmul = ir->operands[1];
 317    ir_expression *mul = ir->operands[0]->as_expression();
 318
 319    if (!mul || mul->operation != ir_binop_mul) {
 320       nonmul = ir->operands[0];
 321       mul = ir->operands[1]->as_expression();
 322
 323       if (!mul || mul->operation != ir_binop_mul)
 324          return false;
 325    }
 326
 327    if (nonmul->as_constant() ||
 328        mul->operands[0]->as_constant() ||
 329        mul->operands[1]->as_constant())
 330       return false;
 331
 332    nonmul->accept(this);
 333    fs_reg src0 = this->result;
 334
 335    mul->operands[0]->accept(this);
 336    fs_reg src1 = this->result;
 337
 338    mul->operands[1]->accept(this);
 339    fs_reg src2 = this->result;
 340
 341    this->result = fs_reg(this, ir->type);
 342    emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
 343
 344    return true;
 345 }
 346
 347 static int
 348 pack_pixel_offset(float x)
 349 {
 350    /* Clamp upper end of the range to +7/16. See explanation in non-constant
 351     * offset case below. */
 352    int n = MIN2((int)(x * 16), 7);
 353    return n & 0xf;
 354 }
 355
 356 void
 357 fs_visitor::emit_interpolate_expression(ir_expression *ir)
 358 {
 359    /* in SIMD16 mode, the pixel interpolator returns coords interleaved
 360     * 8 channels at a time, same as the barycentric coords presented in
 361     * the FS payload. this requires a bit of extra work to support.
 362     */
 363    no16("interpolate_at_* not yet supported in SIMD16 mode.");
 364
 365    ir_dereference * deref = ir->operands[0]->as_dereference();
 366    ir_swizzle * swiz = NULL;
 367    if (!deref) {
 368       /* the api does not allow a swizzle here, but the varying packing code
 369        * may have pushed one into here.
 370        */
 371       swiz = ir->operands[0]->as_swizzle();
 372       assert(swiz);
 373       deref = swiz->val->as_dereference();
 374    }
 375    assert(deref);
 376    ir_variable * var = deref->variable_referenced();
 377    assert(var);
 378
 379    /* 1. collect interpolation factors */
 380
 381    fs_reg dst_x = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 2, 1));
 382    fs_reg dst_y = dst_x;
 383    dst_y.reg_offset++;
 384
 385    /* for most messages, we need one reg of ignored data; the hardware requires mlen==1
 386     * even when there is no payload. in the per-slot offset case, we'll replace this with
 387     * the proper source data. */
 388    fs_reg src = fs_reg(this, glsl_type::float_type);
 389    int mlen = 1;     /* one reg unless overriden */
 390    int reg_width = dispatch_width / 8;
 391    fs_inst *inst;
 392
 393    switch (ir->operation) {
 394    case ir_unop_interpolate_at_centroid:
 395       inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
 396       break;
 397
 398    case ir_binop_interpolate_at_sample: {
 399       ir_constant *sample_num = ir->operands[1]->as_constant();
 400       assert(sample_num || !"nonconstant sample number should have been lowered.");
 401
 402       unsigned msg_data = sample_num->value.i[0] << 4;
 403       inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, fs_reg(msg_data));
 404       break;
 405    }
 406
 407    case ir_binop_interpolate_at_offset: {
 408       ir_constant *const_offset = ir->operands[1]->as_constant();
 409       if (const_offset) {
 410          unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
 411                             (pack_pixel_offset(const_offset->value.f[1]) << 4);
 412          inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
 413                      fs_reg(msg_data));
 414       } else {
 415          /* pack the operands: hw wants offsets as 4 bit signed ints */
 416          ir->operands[1]->accept(this);
 417          src = fs_reg(this, glsl_type::ivec2_type);
 418          fs_reg src2 = src;
 419          for (int i = 0; i < 2; i++) {
 420             fs_reg temp = fs_reg(this, glsl_type::float_type);
 421             emit(MUL(temp, this->result, fs_reg(16.0f)));
 422             emit(MOV(src2, temp));  /* float to int */
 423
 424             /* Clamp the upper end of the range to +7/16. ARB_gpu_shader5 requires
 425              * that we support a maximum offset of +0.5, which isn't representable
 426              * in a S0.4 value -- if we didn't clamp it, we'd end up with -8/16,
 427              * which is the opposite of what the shader author wanted.
 428              *
 429              * This is legal due to ARB_gpu_shader5's quantization rules:
 430              *
 431              * "Not all values of <offset> may be supported; x and y offsets may
 432              * be rounded to fixed-point values with the number of fraction bits
 433              * given by the implementation-dependent constant
 434              * FRAGMENT_INTERPOLATION_OFFSET_BITS"
 435              */
 436
 437             fs_inst *inst = emit(BRW_OPCODE_SEL, src2, src2, fs_reg(7));
 438             inst->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
 439
 440             src2.reg_offset++;
 441             this->result.reg_offset++;
 442          }
 443
 444          mlen = 2 * reg_width;
 445          inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
 446                      fs_reg(0u));
 447       }
 448       break;
 449    }
 450
 451    default:
 452       unreachable("not reached");
 453    }
 454
 455    inst->mlen = mlen;
 456    inst->regs_written = 2 * reg_width; /* 2 floats per slot returned */
 457    inst->pi_noperspective = var->determine_interpolation_mode(key->flat_shade) ==
 458          INTERP_QUALIFIER_NOPERSPECTIVE;
 459
 460    /* 2. emit linterp */
 461
 462    fs_reg res(this, ir->type);
 463    this->result = res;
 464
 465    for (int i = 0; i < ir->type->vector_elements; i++) {
 466       int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
 467       emit(FS_OPCODE_LINTERP, res,
 468            dst_x, dst_y,
 469            fs_reg(interp_reg(var->data.location, ch)));
 470       res.reg_offset++;
 471    }
 472 }
 473
 474 void
 475 fs_visitor::visit(ir_expression *ir)
 476 {
 477    unsigned int operand;
 478    fs_reg op[3], temp;
 479    fs_inst *inst;
 480
 481    assert(ir->get_num_operands() <= 3);
 482
 483    if (try_emit_saturate(ir))
 484       return;
 485
 486    /* Deal with the real oddball stuff first */
 487    switch (ir->operation) {
 488    case ir_binop_add:
 489       if (try_emit_mad(ir))
 490          return;
 491       break;
 492
 493    case ir_unop_interpolate_at_centroid:
 494    case ir_binop_interpolate_at_offset:
 495    case ir_binop_interpolate_at_sample:
 496       emit_interpolate_expression(ir);
 497       return;
 498
 499    default:
 500       break;
 501    }
 502
 503    for (operand = 0; operand < ir->get_num_operands(); operand++) {
 504       ir->operands[operand]->accept(this);
 505       if (this->result.file == BAD_FILE) {
 506          fail("Failed to get tree for expression operand:\n");
 507          ir->operands[operand]->fprint(stderr);
 508          fprintf(stderr, "\n");
 509       }
 510       assert(this->result.is_valid_3src());
 511       op[operand] = this->result;
 512
 513       /* Matrix expression operands should have been broken down to vector
 514        * operations already.
 515        */
 516       assert(!ir->operands[operand]->type->is_matrix());
 517       /* And then those vector operands should have been broken down to scalar.
 518        */
 519       assert(!ir->operands[operand]->type->is_vector());
 520    }
 521
 522    /* Storage for our result.  If our result goes into an assignment, it will
 523     * just get copy-propagated out, so no worries.
 524     */
 525    this->result = fs_reg(this, ir->type);
 526
 527    switch (ir->operation) {
 528    case ir_unop_logic_not:
 529       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
 530        * ones complement of the whole register, not just bit 0.
 531        */
 532       emit(XOR(this->result, op[0], fs_reg(1)));
 533       break;
 534    case ir_unop_neg:
 535       op[0].negate = !op[0].negate;
 536       emit(MOV(this->result, op[0]));
 537       break;
 538    case ir_unop_abs:
 539       op[0].abs = true;
 540       op[0].negate = false;
 541       emit(MOV(this->result, op[0]));
 542       break;
 543    case ir_unop_sign:
 544       if (ir->type->is_float()) {
 545          /* AND(val, 0x80000000) gives the sign bit.
 546           *
 547           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
 548           * zero.
 549           */
 550          emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 551
 552          op[0].type = BRW_REGISTER_TYPE_UD;
 553          this->result.type = BRW_REGISTER_TYPE_UD;
 554          emit(AND(this->result, op[0], fs_reg(0x80000000u)));
 555
 556          inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u)));
 557          inst->predicate = BRW_PREDICATE_NORMAL;
 558
 559          this->result.type = BRW_REGISTER_TYPE_F;
 560       } else {
 561          /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
 562           *               -> non-negative val generates 0x00000000.
 563           *  Predicated OR sets 1 if val is positive.
 564           */
 565          emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));
 566
 567          emit(ASR(this->result, op[0], fs_reg(31)));
 568
 569          inst = emit(OR(this->result, this->result, fs_reg(1)));
 570          inst->predicate = BRW_PREDICATE_NORMAL;
 571       }
 572       break;
 573    case ir_unop_rcp:
 574       emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
 575       break;
 576
 577    case ir_unop_exp2:
 578       emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
 579       break;
 580    case ir_unop_log2:
 581       emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
 582       break;
 583    case ir_unop_exp:
 584    case ir_unop_log:
 585       unreachable("not reached: should be handled by ir_explog_to_explog2");
 586    case ir_unop_sin:
 587    case ir_unop_sin_reduced:
 588       emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
 589       break;
 590    case ir_unop_cos:
 591    case ir_unop_cos_reduced:
 592       emit_math(SHADER_OPCODE_COS, this->result, op[0]);
 593       break;
 594
 595    case ir_unop_dFdx:
 596       emit(FS_OPCODE_DDX, this->result, op[0], fs_reg(BRW_DERIVATIVE_BY_HINT));
 597       break;
 598    case ir_unop_dFdx_coarse:
 599       emit(FS_OPCODE_DDX, this->result, op[0], fs_reg(BRW_DERIVATIVE_COARSE));
 600       break;
 601    case ir_unop_dFdx_fine:
 602       emit(FS_OPCODE_DDX, this->result, op[0], fs_reg(BRW_DERIVATIVE_FINE));
 603       break;
 604    case ir_unop_dFdy:
 605       emit(FS_OPCODE_DDY, this->result, op[0], fs_reg(BRW_DERIVATIVE_BY_HINT));
 606       break;
 607    case ir_unop_dFdy_coarse:
 608       emit(FS_OPCODE_DDY, this->result, op[0], fs_reg(BRW_DERIVATIVE_COARSE));
 609       break;
 610    case ir_unop_dFdy_fine:
 611       emit(FS_OPCODE_DDY, this->result, op[0], fs_reg(BRW_DERIVATIVE_FINE));
 612       break;
 613
 614    case ir_binop_add:
 615       emit(ADD(this->result, op[0], op[1]));
 616       break;
 617    case ir_binop_sub:
 618       unreachable("not reached: should be handled by ir_sub_to_add_neg");
 619
 620    case ir_binop_mul:
 621       if (brw->gen < 8 && ir->type->is_integer()) {
 622          /* For integer multiplication, the MUL uses the low 16 bits
 623           * of one of the operands (src0 on gen6, src1 on gen7).  The
 624           * MACH accumulates in the contribution of the upper 16 bits
 625           * of that operand.
 626           */
 627          if (ir->operands[0]->is_uint16_constant()) {
 628             if (brw->gen < 7)
 629                emit(MUL(this->result, op[0], op[1]));
 630             else
 631                emit(MUL(this->result, op[1], op[0]));
 632          } else if (ir->operands[1]->is_uint16_constant()) {
 633             if (brw->gen < 7)
 634                emit(MUL(this->result, op[1], op[0]));
 635             else
 636                emit(MUL(this->result, op[0], op[1]));
 637          } else {
 638             if (brw->gen >= 7)
 639                no16("SIMD16 explicit accumulator operands unsupported\n");
 640
 641             struct brw_reg acc = retype(brw_acc_reg(), this->result.type);
 642
 643             emit(MUL(acc, op[0], op[1]));
 644             emit(MACH(reg_null_d, op[0], op[1]));
 645             emit(MOV(this->result, fs_reg(acc)));
 646          }
 647       } else {
 648          emit(MUL(this->result, op[0], op[1]));
 649       }
 650       break;
 651    case ir_binop_imul_high: {
 652       if (brw->gen >= 7)
 653          no16("SIMD16 explicit accumulator operands unsupported\n");
 654
 655       struct brw_reg acc = retype(brw_acc_reg(), this->result.type);
 656
 657       emit(MUL(acc, op[0], op[1]));
 658       emit(MACH(this->result, op[0], op[1]));
 659       break;
 660    }
 661    case ir_binop_div:
 662       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
 663       assert(ir->type->is_integer());
 664       emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
 665       break;
 666    case ir_binop_carry: {
 667       if (brw->gen >= 7)
 668          no16("SIMD16 explicit accumulator operands unsupported\n");
 669
 670       struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
 671
 672       emit(ADDC(reg_null_ud, op[0], op[1]));
 673       emit(MOV(this->result, fs_reg(acc)));
 674       break;
 675    }
 676    case ir_binop_borrow: {
 677       if (brw->gen >= 7)
 678          no16("SIMD16 explicit accumulator operands unsupported\n");
 679
 680       struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
 681
 682       emit(SUBB(reg_null_ud, op[0], op[1]));
 683       emit(MOV(this->result, fs_reg(acc)));
 684       break;
 685    }
 686    case ir_binop_mod:
 687       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
 688       assert(ir->type->is_integer());
 689       emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
 690       break;
 691
 692    case ir_binop_less:
 693    case ir_binop_greater:
 694    case ir_binop_lequal:
 695    case ir_binop_gequal:
 696    case ir_binop_equal:
 697    case ir_binop_all_equal:
 698    case ir_binop_nequal:
 699    case ir_binop_any_nequal:
 700       resolve_bool_comparison(ir->operands[0], &op[0]);
 701       resolve_bool_comparison(ir->operands[1], &op[1]);
 702
 703       emit(CMP(this->result, op[0], op[1],
 704                brw_conditional_for_comparison(ir->operation)));
 705       break;
 706
 707    case ir_binop_logic_xor:
 708       emit(XOR(this->result, op[0], op[1]));
 709       break;
 710
 711    case ir_binop_logic_or:
 712       emit(OR(this->result, op[0], op[1]));
 713       break;
 714
 715    case ir_binop_logic_and:
 716       emit(AND(this->result, op[0], op[1]));
 717       break;
 718
 719    case ir_binop_dot:
 720    case ir_unop_any:
 721       unreachable("not reached: should be handled by brw_fs_channel_expressions");
 722
 723    case ir_unop_noise:
 724       unreachable("not reached: should be handled by lower_noise");
 725
 726    case ir_quadop_vector:
 727       unreachable("not reached: should be handled by lower_quadop_vector");
 728
 729    case ir_binop_vector_extract:
 730       unreachable("not reached: should be handled by lower_vec_index_to_cond_assign()");
 731
 732    case ir_triop_vector_insert:
 733       unreachable("not reached: should be handled by lower_vector_insert()");
 734
 735    case ir_binop_ldexp:
 736       unreachable("not reached: should be handled by ldexp_to_arith()");
 737
 738    case ir_unop_sqrt:
 739       emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
 740       break;
 741
 742    case ir_unop_rsq:
 743       emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
 744       break;
 745
 746    case ir_unop_bitcast_i2f:
 747    case ir_unop_bitcast_u2f:
 748       op[0].type = BRW_REGISTER_TYPE_F;
 749       this->result = op[0];
 750       break;
 751    case ir_unop_i2u:
 752    case ir_unop_bitcast_f2u:
 753       op[0].type = BRW_REGISTER_TYPE_UD;
 754       this->result = op[0];
 755       break;
 756    case ir_unop_u2i:
 757    case ir_unop_bitcast_f2i:
 758       op[0].type = BRW_REGISTER_TYPE_D;
 759       this->result = op[0];
 760       break;
 761    case ir_unop_i2f:
 762    case ir_unop_u2f:
 763    case ir_unop_f2i:
 764    case ir_unop_f2u:
 765       emit(MOV(this->result, op[0]));
 766       break;
 767
 768    case ir_unop_b2i:
 769       emit(AND(this->result, op[0], fs_reg(1)));
 770       break;
 771    case ir_unop_b2f:
 772       temp = fs_reg(this, glsl_type::int_type);
 773       emit(AND(temp, op[0], fs_reg(1)));
 774       emit(MOV(this->result, temp));
 775       break;
 776
 777    case ir_unop_f2b:
 778       emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 779       break;
 780    case ir_unop_i2b:
 781       emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
 782       break;
 783
 784    case ir_unop_trunc:
 785       emit(RNDZ(this->result, op[0]));
 786       break;
 787    case ir_unop_ceil:
 788       op[0].negate = !op[0].negate;
 789       emit(RNDD(this->result, op[0]));
 790       this->result.negate = true;
 791       break;
 792    case ir_unop_floor:
 793       emit(RNDD(this->result, op[0]));
 794       break;
 795    case ir_unop_fract:
 796       emit(FRC(this->result, op[0]));
 797       break;
 798    case ir_unop_round_even:
 799       emit(RNDE(this->result, op[0]));
 800       break;
 801
 802    case ir_binop_min:
 803    case ir_binop_max:
 804       resolve_ud_negate(&op[0]);
 805       resolve_ud_negate(&op[1]);
 806       emit_minmax(ir->operation == ir_binop_min ?
 807                   BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
 808                   this->result, op[0], op[1]);
 809       break;
 810    case ir_unop_pack_snorm_2x16:
 811    case ir_unop_pack_snorm_4x8:
 812    case ir_unop_pack_unorm_2x16:
 813    case ir_unop_pack_unorm_4x8:
 814    case ir_unop_unpack_snorm_2x16:
 815    case ir_unop_unpack_snorm_4x8:
 816    case ir_unop_unpack_unorm_2x16:
 817    case ir_unop_unpack_unorm_4x8:
 818    case ir_unop_unpack_half_2x16:
 819    case ir_unop_pack_half_2x16:
 820       unreachable("not reached: should be handled by lower_packing_builtins");
 821    case ir_unop_unpack_half_2x16_split_x:
 822       emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
 823       break;
 824    case ir_unop_unpack_half_2x16_split_y:
 825       emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
 826       break;
 827    case ir_binop_pow:
 828       emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
 829       break;
 830
 831    case ir_unop_bitfield_reverse:
 832       emit(BFREV(this->result, op[0]));
 833       break;
 834    case ir_unop_bit_count:
 835       emit(CBIT(this->result, op[0]));
 836       break;
 837    case ir_unop_find_msb:
 838       temp = fs_reg(this, glsl_type::uint_type);
 839       emit(FBH(temp, op[0]));
 840
 841       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
 842        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
 843        * subtract the result from 31 to convert the MSB count into an LSB count.
 844        */
 845
 846       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
 847       emit(MOV(this->result, temp));
 848       emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));
 849
 850       temp.negate = true;
 851       inst = emit(ADD(this->result, temp, fs_reg(31)));
 852       inst->predicate = BRW_PREDICATE_NORMAL;
 853       break;
 854    case ir_unop_find_lsb:
 855       emit(FBL(this->result, op[0]));
 856       break;
 857    case ir_triop_bitfield_extract:
 858       /* Note that the instruction's argument order is reversed from GLSL
 859        * and the IR.
 860        */
 861       emit(BFE(this->result, op[2], op[1], op[0]));
 862       break;
 863    case ir_binop_bfm:
 864       emit(BFI1(this->result, op[0], op[1]));
 865       break;
 866    case ir_triop_bfi:
 867       emit(BFI2(this->result, op[0], op[1], op[2]));
 868       break;
 869    case ir_quadop_bitfield_insert:
 870       unreachable("not reached: should be handled by "
 871               "lower_instructions::bitfield_insert_to_bfm_bfi");
 872
 873    case ir_unop_bit_not:
 874       emit(NOT(this->result, op[0]));
 875       break;
 876    case ir_binop_bit_and:
 877       emit(AND(this->result, op[0], op[1]));
 878       break;
 879    case ir_binop_bit_xor:
 880       emit(XOR(this->result, op[0], op[1]));
 881       break;
 882    case ir_binop_bit_or:
 883       emit(OR(this->result, op[0], op[1]));
 884       break;
 885
 886    case ir_binop_lshift:
 887       emit(SHL(this->result, op[0], op[1]));
 888       break;
 889
 890    case ir_binop_rshift:
 891       if (ir->type->base_type == GLSL_TYPE_INT)
 892          emit(ASR(this->result, op[0], op[1]));
 893       else
 894          emit(SHR(this->result, op[0], op[1]));
 895       break;
 896    case ir_binop_pack_half_2x16_split:
 897       emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
 898       break;
 899    case ir_binop_ubo_load: {
 900       /* This IR node takes a constant uniform block and a constant or
 901        * variable byte offset within the block and loads a vector from that.
 902        */
 903       ir_constant *const_uniform_block = ir->operands[0]->as_constant();
 904       ir_constant *const_offset = ir->operands[1]->as_constant();
 905       fs_reg surf_index;
 906
 907       if (const_uniform_block) {
 908          /* The block index is a constant, so just emit the binding table entry
 909           * as an immediate.
 910           */
 911          surf_index = fs_reg(prog_data->base.binding_table.ubo_start +
 912                                  const_uniform_block->value.u[0]);
 913       } else {
 914          /* The block index is not a constant. Evaluate the index expression
 915           * per-channel and add the base UBO index; the generator will select
 916           * a value from any live channel.
 917           */
 918          surf_index = fs_reg(this, glsl_type::uint_type);
 919          emit(ADD(surf_index, op[0],
 920                   fs_reg(prog_data->base.binding_table.ubo_start)))
 921             ->force_writemask_all = true;
 922
 923          /* Assume this may touch any UBO. It would be nice to provide
 924           * a tighter bound, but the array information is already lowered away.
 925           */
 926          brw_mark_surface_used(&prog_data->base,
 927                                prog_data->base.binding_table.ubo_start +
 928                                shader_prog->NumUniformBlocks - 1);
 929       }
 930
 931       if (const_offset) {
 932          fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
 933          packed_consts.type = result.type;
 934
 935          fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
 936          emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
 937                                    packed_consts, surf_index, const_offset_reg));
 938
 939          for (int i = 0; i < ir->type->vector_elements; i++) {
 940             packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i);
 941
 942             /* The std140 packing rules don't allow vectors to cross 16-byte
 943              * boundaries, and a reg is 32 bytes.
 944              */
 945             assert(packed_consts.subreg_offset < 32);
 946
 947             /* UBO bools are any nonzero value.  We consider bools to be
 948              * values with the low bit set to 1.  Convert them using CMP.
 949              */
 950             if (ir->type->base_type == GLSL_TYPE_BOOL) {
 951                emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
 952             } else {
 953                emit(MOV(result, packed_consts));
 954             }
 955
 956             result.reg_offset++;
 957          }
 958       } else {
 959          /* Turn the byte offset into a dword offset. */
 960          fs_reg base_offset = fs_reg(this, glsl_type::int_type);
 961          emit(SHR(base_offset, op[1], fs_reg(2)));
 962
 963          for (int i = 0; i < ir->type->vector_elements; i++) {
 964             emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
 965                                             base_offset, i));
 966
 967             if (ir->type->base_type == GLSL_TYPE_BOOL)
 968                emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
 969
 970             result.reg_offset++;
 971          }
 972       }
 973
 974       result.reg_offset = 0;
 975       break;
 976    }
 977
 978    case ir_triop_fma:
 979       /* Note that the instruction's argument order is reversed from GLSL
 980        * and the IR.
 981        */
 982       emit(MAD(this->result, op[2], op[1], op[0]));
 983       break;
 984
 985    case ir_triop_lrp:
 986       emit_lrp(this->result, op[0], op[1], op[2]);
 987       break;
 988
 989    case ir_triop_csel:
 990       emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
 991       inst = emit(BRW_OPCODE_SEL, this->result, op[1], op[2]);
 992       inst->predicate = BRW_PREDICATE_NORMAL;
 993       break;
 994
 995    case ir_unop_interpolate_at_centroid:
 996    case ir_binop_interpolate_at_offset:
 997    case ir_binop_interpolate_at_sample:
 998       unreachable("already handled above");
 999       break;
1000    }
1001 }
1002
1003 void
1004 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
1005                                    const glsl_type *type, bool predicated)
1006 {
1007    switch (type->base_type) {
1008    case GLSL_TYPE_FLOAT:
1009    case GLSL_TYPE_UINT:
1010    case GLSL_TYPE_INT:
1011    case GLSL_TYPE_BOOL:
1012       for (unsigned int i = 0; i < type->components(); i++) {
1013          l.type = brw_type_for_base_type(type);
1014          r.type = brw_type_for_base_type(type);
1015
1016          if (predicated || !l.equals(r)) {
1017             fs_inst *inst = emit(MOV(l, r));
1018             inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
1019          }
1020
1021          l.reg_offset++;
1022          r.reg_offset++;
1023       }
1024       break;
1025    case GLSL_TYPE_ARRAY:
1026       for (unsigned int i = 0; i < type->length; i++) {
1027          emit_assignment_writes(l, r, type->fields.array, predicated);
1028       }
1029       break;
1030
1031    case GLSL_TYPE_STRUCT:
1032       for (unsigned int i = 0; i < type->length; i++) {
1033          emit_assignment_writes(l, r, type->fields.structure[i].type,
1034                                 predicated);
1035       }
1036       break;
1037
1038    case GLSL_TYPE_SAMPLER:
1039    case GLSL_TYPE_IMAGE:
1040    case GLSL_TYPE_ATOMIC_UINT:
1041       break;
1042
1043    case GLSL_TYPE_VOID:
1044    case GLSL_TYPE_ERROR:
1045    case GLSL_TYPE_INTERFACE:
1046       unreachable("not reached");
1047    }
1048 }
1049
1050 /* If the RHS processing resulted in an instruction generating a
1051  * temporary value, and it would be easy to rewrite the instruction to
1052  * generate its result right into the LHS instead, do so.  This ends
1053  * up reliably removing instructions where it can be tricky to do so
1054  * later without real UD chain information.
1055  */
1056 bool
1057 fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1058                                    fs_reg dst,
1059                                    fs_reg src,
1060                                    fs_inst *pre_rhs_inst,
1061                                    fs_inst *last_rhs_inst)
1062 {
1063    /* Only attempt if we're doing a direct assignment. */
1064    if (ir->condition ||
1065        !(ir->lhs->type->is_scalar() ||
1066         (ir->lhs->type->is_vector() &&
1067          ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
1068       return false;
1069
1070    /* Make sure the last instruction generated our source reg. */
1071    fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
1072                                                     last_rhs_inst,
1073                                                     src);
1074    if (!modify)
1075       return false;
1076
1077    /* If last_rhs_inst wrote a different number of components than our LHS,
1078     * we can't safely rewrite it.
1079     */
1080    if (virtual_grf_sizes[dst.reg] != modify->regs_written)
1081       return false;
1082
1083    /* Success!  Rewrite the instruction. */
1084    modify->dst = dst;
1085
1086    return true;
1087 }
1088
1089 void
1090 fs_visitor::visit(ir_assignment *ir)
1091 {
1092    fs_reg l, r;
1093    fs_inst *inst;
1094
1095    /* FINISHME: arrays on the lhs */
1096    ir->lhs->accept(this);
1097    l = this->result;
1098
1099    fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
1100
1101    ir->rhs->accept(this);
1102    r = this->result;
1103
1104    fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
1105
1106    assert(l.file != BAD_FILE);
1107    assert(r.file != BAD_FILE);
1108
1109    if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
1110       return;
1111
1112    if (ir->condition) {
1113       emit_bool_to_cond_code(ir->condition);
1114    }
1115
1116    if (ir->lhs->type->is_scalar() ||
1117        ir->lhs->type->is_vector()) {
1118       for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
1119          if (ir->write_mask & (1 << i)) {
1120             inst = emit(MOV(l, r));
1121             if (ir->condition)
1122                inst->predicate = BRW_PREDICATE_NORMAL;
1123             r.reg_offset++;
1124          }
1125          l.reg_offset++;
1126       }
1127    } else {
1128       emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
1129    }
1130 }
1131
1132 fs_inst *
1133 fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
1134                               fs_reg shadow_c, fs_reg lod, fs_reg dPdy,
1135                               uint32_t sampler)
1136 {
1137    int mlen;
1138    int base_mrf = 1;
1139    bool simd16 = false;
1140    fs_reg orig_dst;
1141
1142    /* g0 header. */
1143    mlen = 1;
1144
1145    if (ir->shadow_comparitor) {
1146       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1147          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1148          coordinate.reg_offset++;
1149       }
1150
1151       /* gen4's SIMD8 sampler always has the slots for u,v,r present.
1152        * the unused slots must be zeroed.
1153        */
1154       for (int i = ir->coordinate->type->vector_elements; i < 3; i++) {
1155          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
1156       }
1157       mlen += 3;
1158
1159       if (ir->op == ir_tex) {
1160          /* There's no plain shadow compare message, so we use shadow
1161           * compare with a bias of 0.0.
1162           */
1163          emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
1164          mlen++;
1165       } else if (ir->op == ir_txb || ir->op == ir_txl) {
1166          emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
1167          mlen++;
1168       } else {
1169          unreachable("Should not get here.");
1170       }
1171
1172       emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
1173       mlen++;
1174    } else if (ir->op == ir_tex) {
1175       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1176          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1177          coordinate.reg_offset++;
1178       }
1179       /* zero the others. */
1180       for (int i = ir->coordinate->type->vector_elements; i<3; i++) {
1181          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
1182       }
1183       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1184       mlen += 3;
1185    } else if (ir->op == ir_txd) {
1186       fs_reg &dPdx = lod;
1187
1188       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1189          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1190          coordinate.reg_offset++;
1191       }
1192       /* the slots for u and v are always present, but r is optional */
1193       mlen += MAX2(ir->coordinate->type->vector_elements, 2);
1194
1195       /*  P   = u, v, r
1196        * dPdx = dudx, dvdx, drdx
1197        * dPdy = dudy, dvdy, drdy
1198        *
1199        * 1-arg: Does not exist.
1200        *
1201        * 2-arg: dudx   dvdx   dudy   dvdy
1202        *        dPdx.x dPdx.y dPdy.x dPdy.y
1203        *        m4     m5     m6     m7
1204        *
1205        * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
1206        *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
1207        *        m5     m6     m7     m8     m9     m10
1208        */
1209       for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
1210          emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
1211          dPdx.reg_offset++;
1212       }
1213       mlen += MAX2(ir->lod_info.grad.dPdx->type->vector_elements, 2);
1214
1215       for (int i = 0; i < ir->lod_info.grad.dPdy->type->vector_elements; i++) {
1216          emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
1217          dPdy.reg_offset++;
1218       }
1219       mlen += MAX2(ir->lod_info.grad.dPdy->type->vector_elements, 2);
1220    } else if (ir->op == ir_txs) {
1221       /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
1222       simd16 = true;
1223       emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
1224       mlen += 2;
1225    } else {
1226       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
1227        * instructions.  We'll need to do SIMD16 here.
1228        */
1229       simd16 = true;
1230       assert(ir->op == ir_txb || ir->op == ir_txl || ir->op == ir_txf);
1231
1232       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1233          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
1234                   coordinate));
1235          coordinate.reg_offset++;
1236       }
1237
1238       /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
1239        * be necessary for TXF (ld), but seems wise to do for all messages.
1240        */
1241       for (int i = ir->coordinate->type->vector_elements; i < 3; i++) {
1242          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
1243       }
1244
1245       /* lod/bias appears after u/v/r. */
1246       mlen += 6;
1247
1248       emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
1249       mlen++;
1250
1251       /* The unused upper half. */
1252       mlen++;
1253    }
1254
1255    if (simd16) {
1256       /* Now, since we're doing simd16, the return is 2 interleaved
1257        * vec4s where the odd-indexed ones are junk. We'll need to move
1258        * this weirdness around to the expected layout.
1259        */
1260       orig_dst = dst;
1261       dst = fs_reg(GRF, virtual_grf_alloc(8),
1262                    (brw->is_g4x ?
1263                     brw_type_for_base_type(ir->type) :
1264                     BRW_REGISTER_TYPE_F));
1265    }
1266
1267    enum opcode opcode;
1268
1269    switch (ir->op) {
1270    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
1271    case ir_txb: opcode = FS_OPCODE_TXB; break;
1272    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
1273    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
1274    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
1275    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
1276    default:
1277       unreachable("not reached");
1278    }
1279
1280    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
1281    inst->base_mrf = base_mrf;
1282    inst->mlen = mlen;
1283    inst->header_present = true;
1284    inst->regs_written = simd16 ? 8 : 4;
1285
1286    if (simd16) {
1287       for (int i = 0; i < 4; i++) {
1288          emit(MOV(orig_dst, dst));
1289          orig_dst.reg_offset++;
1290          dst.reg_offset += 2;
1291       }
1292    }
1293
1294    return inst;
1295 }
1296
1297 /* gen5's sampler has slots for u, v, r, array index, then optional
1298  * parameters like shadow comparitor or LOD bias.  If optional
1299  * parameters aren't present, those base slots are optional and don't
1300  * need to be included in the message.
1301  *
1302  * We don't fill in the unnecessary slots regardless, which may look
1303  * surprising in the disassembly.
1304  */
1305 fs_inst *
1306 fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
1307                               fs_reg shadow_c, fs_reg lod, fs_reg lod2,
1308                               fs_reg sample_index, uint32_t sampler)
1309 {
1310    int mlen = 0;
1311    int base_mrf = 2;
1312    int reg_width = dispatch_width / 8;
1313    bool header_present = false;
1314    const int vector_elements =
1315       ir->coordinate ? ir->coordinate->type->vector_elements : 0;
1316
1317    if (ir->offset) {
1318       /* The offsets set up by the ir_texture visitor are in the
1319        * m1 header, so we can't go headerless.
1320        */
1321       header_present = true;
1322       mlen++;
1323       base_mrf--;
1324    }
1325
1326    for (int i = 0; i < vector_elements; i++) {
1327       emit(MOV(fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
1328                coordinate));
1329       coordinate.reg_offset++;
1330    }
1331    mlen += vector_elements * reg_width;
1332
1333    if (ir->shadow_comparitor) {
1334       mlen = MAX2(mlen, header_present + 4 * reg_width);
1335
1336       emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
1337       mlen += reg_width;
1338    }
1339
1340    enum opcode opcode;
1341    switch (ir->op) {
1342    case ir_tex:
1343       opcode = SHADER_OPCODE_TEX;
1344       break;
1345    case ir_txb:
1346       mlen = MAX2(mlen, header_present + 4 * reg_width);
1347       emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
1348       mlen += reg_width;
1349
1350       opcode = FS_OPCODE_TXB;
1351       break;
1352    case ir_txl:
1353       mlen = MAX2(mlen, header_present + 4 * reg_width);
1354       emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
1355       mlen += reg_width;
1356
1357       opcode = SHADER_OPCODE_TXL;
1358       break;
1359    case ir_txd: {
1360       mlen = MAX2(mlen, header_present + 4 * reg_width); /* skip over 'ai' */
1361
1362       /**
1363        *  P   =  u,    v,    r
1364        * dPdx = dudx, dvdx, drdx
1365        * dPdy = dudy, dvdy, drdy
1366        *
1367        * Load up these values:
1368        * - dudx   dudy   dvdx   dvdy   drdx   drdy
1369        * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
1370        */
1371       for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
1372          emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
1373          lod.reg_offset++;
1374          mlen += reg_width;
1375
1376          emit(MOV(fs_reg(MRF, base_mrf + mlen), lod2));
1377          lod2.reg_offset++;
1378          mlen += reg_width;
1379       }
1380
1381       opcode = SHADER_OPCODE_TXD;
1382       break;
1383    }
1384    case ir_txs:
1385       emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
1386       mlen += reg_width;
1387
1388       opcode = SHADER_OPCODE_TXS;
1389       break;
1390    case ir_query_levels:
1391       emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1392       mlen += reg_width;
1393
1394       opcode = SHADER_OPCODE_TXS;
1395       break;
1396    case ir_txf:
1397       mlen = header_present + 4 * reg_width;
1398       emit(MOV(fs_reg(MRF, base_mrf + mlen - reg_width, BRW_REGISTER_TYPE_UD), lod));
1399
1400       opcode = SHADER_OPCODE_TXF;
1401       break;
1402    case ir_txf_ms:
1403       mlen = header_present + 4 * reg_width;
1404
1405       /* lod */
1406       emit(MOV(fs_reg(MRF, base_mrf + mlen - reg_width, BRW_REGISTER_TYPE_UD), fs_reg(0)));
1407       /* sample index */
1408       emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), sample_index));
1409       mlen += reg_width;
1410
1411       opcode = SHADER_OPCODE_TXF_CMS;
1412       break;
1413    case ir_lod:
1414       opcode = SHADER_OPCODE_LOD;
1415       break;
1416    case ir_tg4:
1417       opcode = SHADER_OPCODE_TG4;
1418       break;
1419    default:
1420       unreachable("not reached");
1421    }
1422
1423    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
1424    inst->base_mrf = base_mrf;
1425    inst->mlen = mlen;
1426    inst->header_present = header_present;
1427    inst->regs_written = 4;
1428
1429    if (mlen > MAX_SAMPLER_MESSAGE_SIZE) {
1430       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
1431            " disallowed by hardware\n");
1432    }
1433
1434    return inst;
1435 }
1436
1437 static bool
1438 is_high_sampler(struct brw_context *brw, fs_reg sampler)
1439 {
1440    if (brw->gen < 8 && !brw->is_haswell)
1441       return false;
1442
1443    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
1444 }
1445
1446 fs_inst *
1447 fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
1448                               fs_reg shadow_c, fs_reg lod, fs_reg lod2,
1449                               fs_reg sample_index, fs_reg mcs, fs_reg sampler)
1450 {
1451    int reg_width = dispatch_width / 8;
1452    bool header_present = false;
1453
1454    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE);
1455    for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) {
1456       sources[i] = fs_reg(this, glsl_type::float_type);
1457    }
1458    int length = 0;
1459
1460    if (ir->op == ir_tg4 || (ir->offset && ir->op != ir_txf) ||
1461        is_high_sampler(brw, sampler)) {
1462       /* For general texture offsets (no txf workaround), we need a header to
1463        * put them in.  Note that for SIMD16 we're making space for two actual
1464        * hardware registers here, so the emit will have to fix up for this.
1465        *
1466        * * ir4_tg4 needs to place its channel select in the header,
1467        * for interaction with ARB_texture_swizzle
1468        *
1469        * The sampler index is only 4-bits, so for larger sampler numbers we
1470        * need to offset the Sampler State Pointer in the header.
1471        */
1472       header_present = true;
1473       sources[length] = reg_undef;
1474       length++;
1475    }
1476
1477    if (ir->shadow_comparitor) {
1478       emit(MOV(sources[length], shadow_c));
1479       length++;
1480    }
1481
1482    bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
1483    bool coordinate_done = false;
1484
1485    /* Set up the LOD info */
1486    switch (ir->op) {
1487    case ir_tex:
1488    case ir_lod:
1489       break;
1490    case ir_txb:
1491       emit(MOV(sources[length], lod));
1492       length++;
1493       break;
1494    case ir_txl:
1495       emit(MOV(sources[length], lod));
1496       length++;
1497       break;
1498    case ir_txd: {
1499       no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
1500
1501       /* Load dPdx and the coordinate together:
1502        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
1503        */
1504       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1505          emit(MOV(sources[length], coordinate));
1506          coordinate.reg_offset++;
1507          length++;
1508
1509          /* For cube map array, the coordinate is (u,v,r,ai) but there are
1510           * only derivatives for (u, v, r).
1511           */
1512          if (i < ir->lod_info.grad.dPdx->type->vector_elements) {
1513             emit(MOV(sources[length], lod));
1514             lod.reg_offset++;
1515             length++;
1516
1517             emit(MOV(sources[length], lod2));
1518             lod2.reg_offset++;
1519             length++;
1520          }
1521       }
1522
1523       coordinate_done = true;
1524       break;
1525    }
1526    case ir_txs:
1527       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod));
1528       length++;
1529       break;
1530    case ir_query_levels:
1531       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1532       length++;
1533       break;
1534    case ir_txf:
1535       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
1536       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1537       coordinate.reg_offset++;
1538       length++;
1539
1540       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod));
1541       length++;
1542
1543       for (int i = 1; i < ir->coordinate->type->vector_elements; i++) {
1544          emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1545          coordinate.reg_offset++;
1546          length++;
1547       }
1548
1549       coordinate_done = true;
1550       break;
1551    case ir_txf_ms:
1552       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index));
1553       length++;
1554
1555       /* data from the multisample control surface */
1556       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs));
1557       length++;
1558
1559       /* there is no offsetting for this message; just copy in the integer
1560        * texture coordinates
1561        */
1562       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1563          emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1564          coordinate.reg_offset++;
1565          length++;
1566       }
1567
1568       coordinate_done = true;
1569       break;
1570    case ir_tg4:
1571       if (has_nonconstant_offset) {
1572          if (ir->shadow_comparitor)
1573             no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
1574
1575          /* More crazy intermixing */
1576          ir->offset->accept(this);
1577          fs_reg offset_value = this->result;
1578
1579          for (int i = 0; i < 2; i++) { /* u, v */
1580             emit(MOV(sources[length], coordinate));
1581             coordinate.reg_offset++;
1582             length++;
1583          }
1584
1585          for (int i = 0; i < 2; i++) { /* offu, offv */
1586             emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value));
1587             offset_value.reg_offset++;
1588             length++;
1589          }
1590
1591          if (ir->coordinate->type->vector_elements == 3) { /* r if present */
1592             emit(MOV(sources[length], coordinate));
1593             coordinate.reg_offset++;
1594             length++;
1595          }
1596
1597          coordinate_done = true;
1598       }
1599       break;
1600    }
1601
1602    /* Set up the coordinate (except for cases where it was done above) */
1603    if (ir->coordinate && !coordinate_done) {
1604       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1605          emit(MOV(sources[length], coordinate));
1606          coordinate.reg_offset++;
1607          length++;
1608       }
1609    }
1610
1611    fs_reg src_payload = fs_reg(GRF, virtual_grf_alloc(length),
1612                                BRW_REGISTER_TYPE_F);
1613    emit(LOAD_PAYLOAD(src_payload, sources, length));
1614
1615    /* Generate the SEND */
1616    enum opcode opcode;
1617    switch (ir->op) {
1618    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
1619    case ir_txb: opcode = FS_OPCODE_TXB; break;
1620    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
1621    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
1622    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
1623    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
1624    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
1625    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
1626    case ir_lod: opcode = SHADER_OPCODE_LOD; break;
1627    case ir_tg4:
1628       if (has_nonconstant_offset)
1629          opcode = SHADER_OPCODE_TG4_OFFSET;
1630       else
1631          opcode = SHADER_OPCODE_TG4;
1632       break;
1633    default:
1634       unreachable("not reached");
1635    }
1636    fs_inst *inst = emit(opcode, dst, src_payload, sampler);
1637    inst->base_mrf = -1;
1638    if (reg_width == 2)
1639       inst->mlen = length * reg_width - header_present;
1640    else
1641       inst->mlen = length * reg_width;
1642    inst->header_present = header_present;
1643    inst->regs_written = 4;
1644
1645    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
1646       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
1647            " disallowed by hardware\n");
1648    }
1649
1650    return inst;
1651 }
1652
1653 fs_reg
1654 fs_visitor::rescale_texcoord(ir_texture *ir, fs_reg coordinate,
1655                              bool is_rect, uint32_t sampler, int texunit)
1656 {
1657    fs_inst *inst = NULL;
1658    bool needs_gl_clamp = true;
1659    fs_reg scale_x, scale_y;
1660
1661    /* The 965 requires the EU to do the normalization of GL rectangle
1662     * texture coordinates.  We use the program parameter state
1663     * tracking to get the scaling factor.
1664     */
1665    if (is_rect &&
1666        (brw->gen < 6 ||
1667         (brw->gen >= 6 && (key->tex.gl_clamp_mask[0] & (1 << sampler) ||
1668                            key->tex.gl_clamp_mask[1] & (1 << sampler))))) {
1669       struct gl_program_parameter_list *params = prog->Parameters;
1670       int tokens[STATE_LENGTH] = {
1671          STATE_INTERNAL,
1672          STATE_TEXRECT_SCALE,
1673          texunit,
1674          0,
1675          0
1676       };
1677
1678       no16("rectangle scale uniform setup not supported on SIMD16\n");
1679       if (dispatch_width == 16) {
1680          return coordinate;
1681       }
1682
1683       GLuint index = _mesa_add_state_reference(params,
1684                                                (gl_state_index *)tokens);
1685       /* Try to find existing copies of the texrect scale uniforms. */
1686       for (unsigned i = 0; i < uniforms; i++) {
1687          if (stage_prog_data->param[i] ==
1688              &prog->Parameters->ParameterValues[index][0]) {
1689             scale_x = fs_reg(UNIFORM, i);
1690             scale_y = fs_reg(UNIFORM, i + 1);
1691             break;
1692          }
1693       }
1694
1695       /* If we didn't already set them up, do so now. */
1696       if (scale_x.file == BAD_FILE) {
1697          scale_x = fs_reg(UNIFORM, uniforms);
1698          scale_y = fs_reg(UNIFORM, uniforms + 1);
1699
1700          stage_prog_data->param[uniforms++] =
1701             &prog->Parameters->ParameterValues[index][0];
1702          stage_prog_data->param[uniforms++] =
1703             &prog->Parameters->ParameterValues[index][1];
1704       }
1705    }
1706
1707    /* The 965 requires the EU to do the normalization of GL rectangle
1708     * texture coordinates.  We use the program parameter state
1709     * tracking to get the scaling factor.
1710     */
1711    if (brw->gen < 6 && is_rect) {
1712       fs_reg dst = fs_reg(this, ir->coordinate->type);
1713       fs_reg src = coordinate;
1714       coordinate = dst;
1715
1716       emit(MUL(dst, src, scale_x));
1717       dst.reg_offset++;
1718       src.reg_offset++;
1719       emit(MUL(dst, src, scale_y));
1720    } else if (is_rect) {
1721       /* On gen6+, the sampler handles the rectangle coordinates
1722        * natively, without needing rescaling.  But that means we have
1723        * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
1724        * not [0, 1] like the default case below.
1725        */
1726       needs_gl_clamp = false;
1727
1728       for (int i = 0; i < 2; i++) {
1729          if (key->tex.gl_clamp_mask[i] & (1 << sampler)) {
1730             fs_reg chan = coordinate;
1731             chan.reg_offset += i;
1732
1733             inst = emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f));
1734             inst->conditional_mod = BRW_CONDITIONAL_G;
1735
1736             /* Our parameter comes in as 1.0/width or 1.0/height,
1737              * because that's what people normally want for doing
1738              * texture rectangle handling.  We need width or height
1739              * for clamping, but we don't care enough to make a new
1740              * parameter type, so just invert back.
1741              */
1742             fs_reg limit = fs_reg(this, glsl_type::float_type);
1743             emit(MOV(limit, i == 0 ? scale_x : scale_y));
1744             emit(SHADER_OPCODE_RCP, limit, limit);
1745
1746             inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
1747             inst->conditional_mod = BRW_CONDITIONAL_L;
1748          }
1749       }
1750    }
1751
1752    if (ir->coordinate && needs_gl_clamp) {
1753       for (unsigned int i = 0;
1754            i < MIN2(ir->coordinate->type->vector_elements, 3); i++) {
1755          if (key->tex.gl_clamp_mask[i] & (1 << sampler)) {
1756             fs_reg chan = coordinate;
1757             chan.reg_offset += i;
1758
1759             fs_inst *inst = emit(MOV(chan, chan));
1760             inst->saturate = true;
1761          }
1762       }
1763    }
1764    return coordinate;
1765 }
1766
1767 /* Sample from the MCS surface attached to this multisample texture. */
1768 fs_reg
1769 fs_visitor::emit_mcs_fetch(ir_texture *ir, fs_reg coordinate, fs_reg sampler)
1770 {
1771    int reg_width = dispatch_width / 8;
1772    int length = ir->coordinate->type->vector_elements;
1773    fs_reg payload = fs_reg(GRF, virtual_grf_alloc(length),
1774                            BRW_REGISTER_TYPE_F);
1775    fs_reg dest = fs_reg(this, glsl_type::uvec4_type);
1776    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, length);
1777
1778    /* parameters are: u, v, r; missing parameters are treated as zero */
1779    for (int i = 0; i < length; i++) {
1780       sources[i] = fs_reg(this, glsl_type::float_type);
1781       emit(MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate));
1782       coordinate.reg_offset++;
1783    }
1784
1785    emit(LOAD_PAYLOAD(payload, sources, length));
1786
1787    fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
1788    inst->base_mrf = -1;
1789    inst->mlen = length * reg_width;
1790    inst->header_present = false;
1791    inst->regs_written = 4; /* we only care about one reg of response,
1792                             * but the sampler always writes 4/8
1793                             */
1794
1795    return dest;
1796 }
1797
1798 void
1799 fs_visitor::visit(ir_texture *ir)
1800 {
1801    fs_inst *inst = NULL;
1802
1803    uint32_t sampler =
1804       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
1805
1806    ir_rvalue *nonconst_sampler_index =
1807       _mesa_get_sampler_array_nonconst_index(ir->sampler);
1808
1809    /* Handle non-constant sampler array indexing */
1810    fs_reg sampler_reg;
1811    if (nonconst_sampler_index) {
1812       /* The highest sampler which may be used by this operation is
1813        * the last element of the array. Mark it here, because the generator
1814        * doesn't have enough information to determine the bound.
1815        */
1816       uint32_t array_size = ir->sampler->as_dereference_array()
1817          ->array->type->array_size();
1818
1819       uint32_t max_used = sampler + array_size - 1;
1820       if (ir->op == ir_tg4 && brw->gen < 8) {
1821          max_used += prog_data->base.binding_table.gather_texture_start;
1822       } else {
1823          max_used += prog_data->base.binding_table.texture_start;
1824       }
1825
1826       brw_mark_surface_used(&prog_data->base, max_used);
1827
1828       /* Emit code to evaluate the actual indexing expression */
1829       nonconst_sampler_index->accept(this);
1830       fs_reg temp(this, glsl_type::uint_type);
1831       emit(ADD(temp, this->result, fs_reg(sampler)))
1832             ->force_writemask_all = true;
1833       sampler_reg = temp;
1834    } else {
1835       /* Single sampler, or constant array index; the indexing expression
1836        * is just an immediate.
1837        */
1838       sampler_reg = fs_reg(sampler);
1839    }
1840
1841    /* FINISHME: We're failing to recompile our programs when the sampler is
1842     * updated.  This only matters for the texture rectangle scale parameters
1843     * (pre-gen6, or gen6+ with GL_CLAMP).
1844     */
1845    int texunit = prog->SamplerUnits[sampler];
1846
1847    if (ir->op == ir_tg4) {
1848       /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
1849        * emitting anything other than setting up the constant result.
1850        */
1851       ir_constant *chan = ir->lod_info.component->as_constant();
1852       int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
1853       if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
1854
1855          fs_reg res = fs_reg(this, glsl_type::vec4_type);
1856          this->result = res;
1857
1858          for (int i=0; i<4; i++) {
1859             emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
1860             res.reg_offset++;
1861          }
1862          return;
1863       }
1864    }
1865
1866    /* Should be lowered by do_lower_texture_projection */
1867    assert(!ir->projector);
1868
1869    /* Should be lowered */
1870    assert(!ir->offset || !ir->offset->type->is_array());
1871
1872    /* Generate code to compute all the subexpression trees.  This has to be
1873     * done before loading any values into MRFs for the sampler message since
1874     * generating these values may involve SEND messages that need the MRFs.
1875     */
1876    fs_reg coordinate;
1877    if (ir->coordinate) {
1878       ir->coordinate->accept(this);
1879
1880       coordinate = rescale_texcoord(ir, this->result,
1881                                     ir->sampler->type->sampler_dimensionality ==
1882                                     GLSL_SAMPLER_DIM_RECT,
1883                                     sampler, texunit);
1884    }
1885
1886    fs_reg shadow_comparitor;
1887    if (ir->shadow_comparitor) {
1888       ir->shadow_comparitor->accept(this);
1889       shadow_comparitor = this->result;
1890    }
1891
1892    fs_reg lod, lod2, sample_index, mcs;
1893    switch (ir->op) {
1894    case ir_tex:
1895    case ir_lod:
1896    case ir_tg4:
1897    case ir_query_levels:
1898       break;
1899    case ir_txb:
1900       ir->lod_info.bias->accept(this);
1901       lod = this->result;
1902       break;
1903    case ir_txd:
1904       ir->lod_info.grad.dPdx->accept(this);
1905       lod = this->result;
1906
1907       ir->lod_info.grad.dPdy->accept(this);
1908       lod2 = this->result;
1909       break;
1910    case ir_txf:
1911    case ir_txl:
1912    case ir_txs:
1913       ir->lod_info.lod->accept(this);
1914       lod = this->result;
1915       break;
1916    case ir_txf_ms:
1917       ir->lod_info.sample_index->accept(this);
1918       sample_index = this->result;
1919
1920       if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
1921          mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
1922       else
1923          mcs = fs_reg(0u);
1924       break;
1925    default:
1926       unreachable("Unrecognized texture opcode");
1927    };
1928
1929    /* Writemasking doesn't eliminate channels on SIMD8 texture
1930     * samples, so don't worry about them.
1931     */
1932    fs_reg dst = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1));
1933
1934    if (brw->gen >= 7) {
1935       inst = emit_texture_gen7(ir, dst, coordinate, shadow_comparitor,
1936                                lod, lod2, sample_index, mcs, sampler_reg);
1937    } else if (brw->gen >= 5) {
1938       inst = emit_texture_gen5(ir, dst, coordinate, shadow_comparitor,
1939                                lod, lod2, sample_index, sampler);
1940    } else {
1941       inst = emit_texture_gen4(ir, dst, coordinate, shadow_comparitor,
1942                                lod, lod2, sampler);
1943    }
1944
1945    if (ir->offset != NULL && ir->op != ir_txf)
1946       inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
1947
1948    if (ir->op == ir_tg4)
1949       inst->texture_offset |= gather_channel(ir, sampler) << 16; // M0.2:16-17
1950
1951    if (ir->shadow_comparitor)
1952       inst->shadow_compare = true;
1953
1954    /* fixup #layers for cube map arrays */
1955    if (ir->op == ir_txs) {
1956       glsl_type const *type = ir->sampler->type;
1957       if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
1958           type->sampler_array) {
1959          fs_reg depth = dst;
1960          depth.reg_offset = 2;
1961          fs_reg fixed_depth = fs_reg(this, glsl_type::int_type);
1962          emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
1963
1964          fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
1965          fs_reg d = dst;
1966          for (int i = 0; i < inst->regs_written; i++) {
1967             if (i == 2) {
1968                fixed_payload[i] = fixed_depth;
1969             } else {
1970                d.reg_offset = i;
1971                fixed_payload[i] = d;
1972             }
1973          }
1974          emit(LOAD_PAYLOAD(dst, fixed_payload, inst->regs_written));
1975       }
1976    }
1977
1978    if (brw->gen == 6 && ir->op == ir_tg4) {
1979       emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], dst);
1980    }
1981
1982    swizzle_result(ir, dst, sampler);
1983 }
1984
1985 /**
1986  * Apply workarounds for Gen6 gather with UINT/SINT
1987  */
1988 void
1989 fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
1990 {
1991    if (!wa)
1992       return;
1993
1994    int width = (wa & WA_8BIT) ? 8 : 16;
1995
1996    for (int i = 0; i < 4; i++) {
1997       fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
1998       /* Convert from UNORM to UINT */
1999       emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1))));
2000       emit(MOV(dst, dst_f));
2001
2002       if (wa & WA_SIGN) {
2003          /* Reinterpret the UINT value as a signed INT value by
2004           * shifting the sign bit into place, then shifting back
2005           * preserving sign.
2006           */
2007          emit(SHL(dst, dst, fs_reg(32 - width)));
2008          emit(ASR(dst, dst, fs_reg(32 - width)));
2009       }
2010
2011       dst.reg_offset++;
2012    }
2013 }
2014
2015 /**
2016  * Set up the gather channel based on the swizzle, for gather4.
2017  */
2018 uint32_t
2019 fs_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2020 {
2021    ir_constant *chan = ir->lod_info.component->as_constant();
2022    int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2023    switch (swiz) {
2024       case SWIZZLE_X: return 0;
2025       case SWIZZLE_Y:
2026          /* gather4 sampler is broken for green channel on RG32F --
2027           * we must ask for blue instead.
2028           */
2029          if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2030             return 2;
2031          return 1;
2032       case SWIZZLE_Z: return 2;
2033       case SWIZZLE_W: return 3;
2034       default:
2035          unreachable("Not reached"); /* zero, one swizzles handled already */
2036    }
2037 }
2038
2039 /**
2040  * Swizzle the result of a texture result.  This is necessary for
2041  * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
2042  */
2043 void
2044 fs_visitor::swizzle_result(ir_texture *ir, fs_reg orig_val, uint32_t sampler)
2045 {
2046    if (ir->op == ir_query_levels) {
2047       /* # levels is in .w */
2048       orig_val.reg_offset += 3;
2049       this->result = orig_val;
2050       return;
2051    }
2052
2053    this->result = orig_val;
2054
2055    /* txs,lod don't actually sample the texture, so swizzling the result
2056     * makes no sense.
2057     */
2058    if (ir->op == ir_txs || ir->op == ir_lod || ir->op == ir_tg4)
2059       return;
2060
2061    if (ir->type == glsl_type::float_type) {
2062       /* Ignore DEPTH_TEXTURE_MODE swizzling. */
2063       assert(ir->sampler->type->sampler_shadow);
2064    } else if (key->tex.swizzles[sampler] != SWIZZLE_NOOP) {
2065       fs_reg swizzled_result = fs_reg(this, glsl_type::vec4_type);
2066
2067       for (int i = 0; i < 4; i++) {
2068          int swiz = GET_SWZ(key->tex.swizzles[sampler], i);
2069          fs_reg l = swizzled_result;
2070          l.reg_offset += i;
2071
2072          if (swiz == SWIZZLE_ZERO) {
2073             emit(MOV(l, fs_reg(0.0f)));
2074          } else if (swiz == SWIZZLE_ONE) {
2075             emit(MOV(l, fs_reg(1.0f)));
2076          } else {
2077             fs_reg r = orig_val;
2078             r.reg_offset += GET_SWZ(key->tex.swizzles[sampler], i);
2079             emit(MOV(l, r));
2080          }
2081       }
2082       this->result = swizzled_result;
2083    }
2084 }
2085
2086 void
2087 fs_visitor::visit(ir_swizzle *ir)
2088 {
2089    ir->val->accept(this);
2090    fs_reg val = this->result;
2091
2092    if (ir->type->vector_elements == 1) {
2093       this->result.reg_offset += ir->mask.x;
2094       return;
2095    }
2096
2097    fs_reg result = fs_reg(this, ir->type);
2098    this->result = result;
2099
2100    for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
2101       fs_reg channel = val;
2102       int swiz = 0;
2103
2104       switch (i) {
2105       case 0:
2106          swiz = ir->mask.x;
2107          break;
2108       case 1:
2109          swiz = ir->mask.y;
2110          break;
2111       case 2:
2112          swiz = ir->mask.z;
2113          break;
2114       case 3:
2115          swiz = ir->mask.w;
2116          break;
2117       }
2118
2119       channel.reg_offset += swiz;
2120       emit(MOV(result, channel));
2121       result.reg_offset++;
2122    }
2123 }
2124
2125 void
2126 fs_visitor::visit(ir_discard *ir)
2127 {
2128    assert(ir->condition == NULL); /* FINISHME */
2129
2130    /* We track our discarded pixels in f0.1.  By predicating on it, we can
2131     * update just the flag bits that aren't yet discarded.  By emitting a
2132     * CMP of g0 != g0, all our currently executing channels will get turned
2133     * off.
2134     */
2135    fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
2136                                    BRW_REGISTER_TYPE_UW));
2137    fs_inst *cmp = emit(CMP(reg_null_f, some_reg, some_reg,
2138                            BRW_CONDITIONAL_NZ));
2139    cmp->predicate = BRW_PREDICATE_NORMAL;
2140    cmp->flag_subreg = 1;
2141
2142    if (brw->gen >= 6) {
2143       /* For performance, after a discard, jump to the end of the shader.
2144        * Only jump if all relevant channels have been discarded.
2145        */
2146       fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
2147       discard_jump->flag_subreg = 1;
2148
2149       discard_jump->predicate = (dispatch_width == 8)
2150                                 ? BRW_PREDICATE_ALIGN1_ANY8H
2151                                 : BRW_PREDICATE_ALIGN1_ANY16H;
2152       discard_jump->predicate_inverse = true;
2153    }
2154 }
2155
2156 void
2157 fs_visitor::visit(ir_constant *ir)
2158 {
2159    /* Set this->result to reg at the bottom of the function because some code
2160     * paths will cause this visitor to be applied to other fields.  This will
2161     * cause the value stored in this->result to be modified.
2162     *
2163     * Make reg constant so that it doesn't get accidentally modified along the
2164     * way.  Yes, I actually had this problem. :(
2165     */
2166    const fs_reg reg(this, ir->type);
2167    fs_reg dst_reg = reg;
2168
2169    if (ir->type->is_array()) {
2170       const unsigned size = type_size(ir->type->fields.array);
2171
2172       for (unsigned i = 0; i < ir->type->length; i++) {
2173          ir->array_elements[i]->accept(this);
2174          fs_reg src_reg = this->result;
2175
2176          dst_reg.type = src_reg.type;
2177          for (unsigned j = 0; j < size; j++) {
2178             emit(MOV(dst_reg, src_reg));
2179             src_reg.reg_offset++;
2180             dst_reg.reg_offset++;
2181          }
2182       }
2183    } else if (ir->type->is_record()) {
2184       foreach_in_list(ir_constant, field, &ir->components) {
2185          const unsigned size = type_size(field->type);
2186
2187          field->accept(this);
2188          fs_reg src_reg = this->result;
2189
2190          dst_reg.type = src_reg.type;
2191          for (unsigned j = 0; j < size; j++) {
2192             emit(MOV(dst_reg, src_reg));
2193             src_reg.reg_offset++;
2194             dst_reg.reg_offset++;
2195          }
2196       }
2197    } else {
2198       const unsigned size = type_size(ir->type);
2199
2200       for (unsigned i = 0; i < size; i++) {
2201          switch (ir->type->base_type) {
2202          case GLSL_TYPE_FLOAT:
2203             emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
2204             break;
2205          case GLSL_TYPE_UINT:
2206             emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
2207             break;
2208          case GLSL_TYPE_INT:
2209             emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
2210             break;
2211          case GLSL_TYPE_BOOL:
2212             emit(MOV(dst_reg, fs_reg((int)ir->value.b[i])));
2213             break;
2214          default:
2215             unreachable("Non-float/uint/int/bool constant");
2216          }
2217          dst_reg.reg_offset++;
2218       }
2219    }
2220
2221    this->result = reg;
2222 }
2223
2224 void
2225 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
2226 {
2227    ir_expression *expr = ir->as_expression();
2228
2229    if (expr &&
2230        expr->operation != ir_binop_logic_and &&
2231        expr->operation != ir_binop_logic_or &&
2232        expr->operation != ir_binop_logic_xor) {
2233       fs_reg op[2];
2234       fs_inst *inst;
2235
2236       assert(expr->get_num_operands() <= 2);
2237       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
2238          assert(expr->operands[i]->type->is_scalar());
2239
2240          expr->operands[i]->accept(this);
2241          op[i] = this->result;
2242
2243          resolve_ud_negate(&op[i]);
2244       }
2245
2246       switch (expr->operation) {
2247       case ir_unop_logic_not:
2248          inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
2249          inst->conditional_mod = BRW_CONDITIONAL_Z;
2250          break;
2251
2252       case ir_unop_f2b:
2253          if (brw->gen >= 6) {
2254             emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
2255          } else {
2256             inst = emit(MOV(reg_null_f, op[0]));
2257             inst->conditional_mod = BRW_CONDITIONAL_NZ;
2258          }
2259          break;
2260
2261       case ir_unop_i2b:
2262          if (brw->gen >= 6) {
2263             emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
2264          } else {
2265             inst = emit(MOV(reg_null_d, op[0]));
2266             inst->conditional_mod = BRW_CONDITIONAL_NZ;
2267          }
2268          break;
2269
2270       case ir_binop_greater:
2271       case ir_binop_gequal:
2272       case ir_binop_less:
2273       case ir_binop_lequal:
2274       case ir_binop_equal:
2275       case ir_binop_all_equal:
2276       case ir_binop_nequal:
2277       case ir_binop_any_nequal:
2278          resolve_bool_comparison(expr->operands[0], &op[0]);
2279          resolve_bool_comparison(expr->operands[1], &op[1]);
2280
2281          emit(CMP(reg_null_d, op[0], op[1],
2282                   brw_conditional_for_comparison(expr->operation)));
2283          break;
2284
2285       default:
2286          unreachable("not reached");
2287       }
2288       return;
2289    }
2290
2291    ir->accept(this);
2292
2293    fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
2294    inst->conditional_mod = BRW_CONDITIONAL_NZ;
2295 }
2296
2297 /**
2298  * Emit a gen6 IF statement with the comparison folded into the IF
2299  * instruction.
2300  */
2301 void
2302 fs_visitor::emit_if_gen6(ir_if *ir)
2303 {
2304    ir_expression *expr = ir->condition->as_expression();
2305
2306    if (expr) {
2307       fs_reg op[2];
2308       fs_inst *inst;
2309       fs_reg temp;
2310
2311       assert(expr->get_num_operands() <= 2);
2312       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
2313          assert(expr->operands[i]->type->is_scalar());
2314
2315          expr->operands[i]->accept(this);
2316          op[i] = this->result;
2317       }
2318
2319       switch (expr->operation) {
2320       case ir_unop_logic_not:
2321       case ir_binop_logic_xor:
2322       case ir_binop_logic_or:
2323       case ir_binop_logic_and:
2324          /* For operations on bool arguments, only the low bit of the bool is
2325           * valid, and the others are undefined.  Fall back to the condition
2326           * code path.
2327           */
2328          break;
2329
2330       case ir_unop_f2b:
2331          inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
2332          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2333          return;
2334
2335       case ir_unop_i2b:
2336          emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
2337          return;
2338
2339       case ir_binop_greater:
2340       case ir_binop_gequal:
2341       case ir_binop_less:
2342       case ir_binop_lequal:
2343       case ir_binop_equal:
2344       case ir_binop_all_equal:
2345       case ir_binop_nequal:
2346       case ir_binop_any_nequal:
2347          resolve_bool_comparison(expr->operands[0], &op[0]);
2348          resolve_bool_comparison(expr->operands[1], &op[1]);
2349
2350          emit(IF(op[0], op[1],
2351                  brw_conditional_for_comparison(expr->operation)));
2352          return;
2353       default:
2354          unreachable("not reached");
2355       }
2356    }
2357
2358    emit_bool_to_cond_code(ir->condition);
2359    fs_inst *inst = emit(BRW_OPCODE_IF);
2360    inst->predicate = BRW_PREDICATE_NORMAL;
2361 }
2362
2363 /**
2364  * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
2365  *
2366  * Many GLSL shaders contain the following pattern:
2367  *
2368  *    x = condition ? foo : bar
2369  *
2370  * The compiler emits an ir_if tree for this, since each subexpression might be
2371  * a complex tree that could have side-effects or short-circuit logic.
2372  *
2373  * However, the common case is to simply select one of two constants or
2374  * variable values---which is exactly what SEL is for.  In this case, the
2375  * assembly looks like:
2376  *
2377  *    (+f0) IF
2378  *    MOV dst src0
2379  *    ELSE
2380  *    MOV dst src1
2381  *    ENDIF
2382  *
2383  * which can be easily translated into:
2384  *
2385  *    (+f0) SEL dst src0 src1
2386  *
2387  * If src0 is an immediate value, we promote it to a temporary GRF.
2388  */
2389 void
2390 fs_visitor::try_replace_with_sel()
2391 {
2392    fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
2393    assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
2394
2395    /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
2396    int opcodes[] = {
2397       BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
2398    };
2399
2400    fs_inst *match = (fs_inst *) endif_inst->prev;
2401    for (int i = 0; i < 4; i++) {
2402       if (match->is_head_sentinel() || match->opcode != opcodes[4-i-1])
2403          return;
2404       match = (fs_inst *) match->prev;
2405    }
2406
2407    /* The opcodes match; it looks like the right sequence of instructions. */
2408    fs_inst *else_mov = (fs_inst *) endif_inst->prev;
2409    fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
2410    fs_inst *if_inst = (fs_inst *) then_mov->prev;
2411
2412    /* Check that the MOVs are the right form. */
2413    if (then_mov->dst.equals(else_mov->dst) &&
2414        !then_mov->is_partial_write() &&
2415        !else_mov->is_partial_write()) {
2416
2417       /* Remove the matched instructions; we'll emit a SEL to replace them. */
2418       while (!if_inst->next->is_tail_sentinel())
2419          if_inst->next->remove();
2420       if_inst->remove();
2421
2422       /* Only the last source register can be a constant, so if the MOV in
2423        * the "then" clause uses a constant, we need to put it in a temporary.
2424        */
2425       fs_reg src0(then_mov->src[0]);
2426       if (src0.file == IMM) {
2427          src0 = fs_reg(this, glsl_type::float_type);
2428          src0.type = then_mov->src[0].type;
2429          emit(MOV(src0, then_mov->src[0]));
2430       }
2431
2432       fs_inst *sel;
2433       if (if_inst->conditional_mod) {
2434          /* Sandybridge-specific IF with embedded comparison */
2435          emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
2436                   if_inst->conditional_mod));
2437          sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
2438          sel->predicate = BRW_PREDICATE_NORMAL;
2439       } else {
2440          /* Separate CMP and IF instructions */
2441          sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
2442          sel->predicate = if_inst->predicate;
2443          sel->predicate_inverse = if_inst->predicate_inverse;
2444       }
2445    }
2446 }
2447
2448 void
2449 fs_visitor::visit(ir_if *ir)
2450 {
2451    if (brw->gen < 6) {
2452       no16("Can't support (non-uniform) control flow on SIMD16\n");
2453    }
2454
2455    /* Don't point the annotation at the if statement, because then it plus
2456     * the then and else blocks get printed.
2457     */
2458    this->base_ir = ir->condition;
2459
2460    if (brw->gen == 6) {
2461       emit_if_gen6(ir);
2462    } else {
2463       emit_bool_to_cond_code(ir->condition);
2464
2465       emit(IF(BRW_PREDICATE_NORMAL));
2466    }
2467
2468    foreach_in_list(ir_instruction, ir_, &ir->then_instructions) {
2469       this->base_ir = ir_;
2470       ir_->accept(this);
2471    }
2472
2473    if (!ir->else_instructions.is_empty()) {
2474       emit(BRW_OPCODE_ELSE);
2475
2476       foreach_in_list(ir_instruction, ir_, &ir->else_instructions) {
2477          this->base_ir = ir_;
2478          ir_->accept(this);
2479       }
2480    }
2481
2482    emit(BRW_OPCODE_ENDIF);
2483
2484    try_replace_with_sel();
2485 }
2486
2487 void
2488 fs_visitor::visit(ir_loop *ir)
2489 {
2490    if (brw->gen < 6) {
2491       no16("Can't support (non-uniform) control flow on SIMD16\n");
2492    }
2493
2494    this->base_ir = NULL;
2495    emit(BRW_OPCODE_DO);
2496
2497    foreach_in_list(ir_instruction, ir_, &ir->body_instructions) {
2498       this->base_ir = ir_;
2499       ir_->accept(this);
2500    }
2501
2502    this->base_ir = NULL;
2503    emit(BRW_OPCODE_WHILE);
2504 }
2505
2506 void
2507 fs_visitor::visit(ir_loop_jump *ir)
2508 {
2509    switch (ir->mode) {
2510    case ir_loop_jump::jump_break:
2511       emit(BRW_OPCODE_BREAK);
2512       break;
2513    case ir_loop_jump::jump_continue:
2514       emit(BRW_OPCODE_CONTINUE);
2515       break;
2516    }
2517 }
2518
2519 void
2520 fs_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2521 {
2522    ir_dereference *deref = static_cast<ir_dereference *>(
2523       ir->actual_parameters.get_head());
2524    ir_variable *location = deref->variable_referenced();
2525    unsigned surf_index = (prog_data->base.binding_table.abo_start +
2526                           location->data.atomic.buffer_index);
2527
2528    /* Calculate the surface offset */
2529    fs_reg offset(this, glsl_type::uint_type);
2530    ir_dereference_array *deref_array = deref->as_dereference_array();
2531
2532    if (deref_array) {
2533       deref_array->array_index->accept(this);
2534
2535       fs_reg tmp(this, glsl_type::uint_type);
2536       emit(MUL(tmp, this->result, ATOMIC_COUNTER_SIZE));
2537       emit(ADD(offset, tmp, location->data.atomic.offset));
2538    } else {
2539       offset = location->data.atomic.offset;
2540    }
2541
2542    /* Emit the appropriate machine instruction */
2543    const char *callee = ir->callee->function_name();
2544    ir->return_deref->accept(this);
2545    fs_reg dst = this->result;
2546
2547    if (!strcmp("__intrinsic_atomic_read", callee)) {
2548       emit_untyped_surface_read(surf_index, dst, offset);
2549
2550    } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2551       emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2552                           fs_reg(), fs_reg());
2553
2554    } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2555       emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2556                           fs_reg(), fs_reg());
2557    }
2558 }
2559
2560 void
2561 fs_visitor::visit(ir_call *ir)
2562 {
2563    const char *callee = ir->callee->function_name();
2564
2565    if (!strcmp("__intrinsic_atomic_read", callee) ||
2566        !strcmp("__intrinsic_atomic_increment", callee) ||
2567        !strcmp("__intrinsic_atomic_predecrement", callee)) {
2568       visit_atomic_counter_intrinsic(ir);
2569    } else {
2570       unreachable("Unsupported intrinsic.");
2571    }
2572 }
2573
2574 void
2575 fs_visitor::visit(ir_return *)
2576 {
2577    unreachable("FINISHME");
2578 }
2579
2580 void
2581 fs_visitor::visit(ir_function *ir)
2582 {
2583    /* Ignore function bodies other than main() -- we shouldn't see calls to
2584     * them since they should all be inlined before we get to ir_to_mesa.
2585     */
2586    if (strcmp(ir->name, "main") == 0) {
2587       const ir_function_signature *sig;
2588       exec_list empty;
2589
2590       sig = ir->matching_signature(NULL, &empty, false);
2591
2592       assert(sig);
2593
2594       foreach_in_list(ir_instruction, ir_, &sig->body) {
2595          this->base_ir = ir_;
2596          ir_->accept(this);
2597       }
2598    }
2599 }
2600
2601 void
2602 fs_visitor::visit(ir_function_signature *)
2603 {
2604    unreachable("not reached");
2605 }
2606
2607 void
2608 fs_visitor::visit(ir_emit_vertex *)
2609 {
2610    unreachable("not reached");
2611 }
2612
2613 void
2614 fs_visitor::visit(ir_end_primitive *)
2615 {
2616    unreachable("not reached");
2617 }
2618
2619 void
2620 fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2621                                 fs_reg dst, fs_reg offset, fs_reg src0,
2622                                 fs_reg src1)
2623 {
2624    const unsigned operand_len = dispatch_width / 8;
2625    unsigned mlen = 0;
2626
2627    /* Initialize the sample mask in the message header. */
2628    emit(MOV(brw_uvec_mrf(8, mlen, 0), fs_reg(0u)))
2629       ->force_writemask_all = true;
2630
2631    if (fp->UsesKill) {
2632       emit(MOV(brw_uvec_mrf(1, mlen, 7), brw_flag_reg(0, 1)))
2633          ->force_writemask_all = true;
2634    } else {
2635       emit(MOV(brw_uvec_mrf(1, mlen, 7),
2636                retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
2637          ->force_writemask_all = true;
2638    }
2639
2640    mlen++;
2641
2642    /* Set the atomic operation offset. */
2643    emit(MOV(brw_uvec_mrf(dispatch_width, mlen, 0), offset));
2644    mlen += operand_len;
2645
2646    /* Set the atomic operation arguments. */
2647    if (src0.file != BAD_FILE) {
2648       emit(MOV(brw_uvec_mrf(dispatch_width, mlen, 0), src0));
2649       mlen += operand_len;
2650    }
2651
2652    if (src1.file != BAD_FILE) {
2653       emit(MOV(brw_uvec_mrf(dispatch_width, mlen, 0), src1));
2654       mlen += operand_len;
2655    }
2656
2657    /* Emit the instruction. */
2658    fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2659                                         atomic_op, surf_index);
2660    inst->base_mrf = 0;
2661    inst->mlen = mlen;
2662    inst->header_present = true;
2663    emit(inst);
2664 }
2665
2666 void
2667 fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
2668                                       fs_reg offset)
2669 {
2670    const unsigned operand_len = dispatch_width / 8;
2671    unsigned mlen = 0;
2672
2673    /* Initialize the sample mask in the message header. */
2674    emit(MOV(brw_uvec_mrf(8, mlen, 0), fs_reg(0u)))
2675       ->force_writemask_all = true;
2676
2677    if (fp->UsesKill) {
2678       emit(MOV(brw_uvec_mrf(1, mlen, 7), brw_flag_reg(0, 1)))
2679          ->force_writemask_all = true;
2680    } else {
2681       emit(MOV(brw_uvec_mrf(1, mlen, 7),
2682                retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
2683          ->force_writemask_all = true;
2684    }
2685
2686    mlen++;
2687
2688    /* Set the surface read offset. */
2689    emit(MOV(brw_uvec_mrf(dispatch_width, mlen, 0), offset));
2690    mlen += operand_len;
2691
2692    /* Emit the instruction. */
2693    fs_inst *inst = new(mem_ctx)
2694       fs_inst(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, surf_index);
2695    inst->base_mrf = 0;
2696    inst->mlen = mlen;
2697    inst->header_present = true;
2698    emit(inst);
2699 }
2700
2701 fs_inst *
2702 fs_visitor::emit(fs_inst *inst)
2703 {
2704    if (force_uncompressed_stack > 0)
2705       inst->force_uncompressed = true;
2706
2707    inst->annotation = this->current_annotation;
2708    inst->ir = this->base_ir;
2709
2710    this->instructions.push_tail(inst);
2711
2712    return inst;
2713 }
2714
2715 void
2716 fs_visitor::emit(exec_list list)
2717 {
2718    foreach_in_list_safe(fs_inst, inst, &list) {
2719       inst->remove();
2720       emit(inst);
2721    }
2722 }
2723
2724 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
2725 void
2726 fs_visitor::emit_dummy_fs()
2727 {
2728    int reg_width = dispatch_width / 8;
2729
2730    /* Everyone's favorite color. */
2731    emit(MOV(fs_reg(MRF, 2 + 0 * reg_width), fs_reg(1.0f)));
2732    emit(MOV(fs_reg(MRF, 2 + 1 * reg_width), fs_reg(0.0f)));
2733    emit(MOV(fs_reg(MRF, 2 + 2 * reg_width), fs_reg(1.0f)));
2734    emit(MOV(fs_reg(MRF, 2 + 3 * reg_width), fs_reg(0.0f)));
2735
2736    fs_inst *write;
2737    write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
2738    write->base_mrf = 2;
2739    write->mlen = 4 * reg_width;
2740    write->eot = true;
2741 }
2742
2743 /* The register location here is relative to the start of the URB
2744  * data.  It will get adjusted to be a real location before
2745  * generate_code() time.
2746  */
2747 struct brw_reg
2748 fs_visitor::interp_reg(int location, int channel)
2749 {
2750    int regnr = prog_data->urb_setup[location] * 2 + channel / 2;
2751    int stride = (channel & 1) * 4;
2752
2753    assert(prog_data->urb_setup[location] != -1);
2754
2755    return brw_vec1_grf(regnr, stride);
2756 }
2757
2758 /** Emits the interpolation for the varying inputs. */
2759 void
2760 fs_visitor::emit_interpolation_setup_gen4()
2761 {
2762    this->current_annotation = "compute pixel centers";
2763    this->pixel_x = fs_reg(this, glsl_type::uint_type);
2764    this->pixel_y = fs_reg(this, glsl_type::uint_type);
2765    this->pixel_x.type = BRW_REGISTER_TYPE_UW;
2766    this->pixel_y.type = BRW_REGISTER_TYPE_UW;
2767
2768    emit(FS_OPCODE_PIXEL_X, this->pixel_x);
2769    emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
2770
2771    this->current_annotation = "compute pixel deltas from v0";
2772    if (brw->has_pln) {
2773       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
2774          fs_reg(this, glsl_type::vec2_type);
2775       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
2776          this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
2777       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg_offset++;
2778    } else {
2779       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
2780          fs_reg(this, glsl_type::float_type);
2781       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
2782          fs_reg(this, glsl_type::float_type);
2783    }
2784    emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
2785             this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))));
2786    emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
2787             this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))));
2788
2789    this->current_annotation = "compute pos.w and 1/pos.w";
2790    /* Compute wpos.w.  It's always in our setup, since it's needed to
2791     * interpolate the other attributes.
2792     */
2793    this->wpos_w = fs_reg(this, glsl_type::float_type);
2794    emit(FS_OPCODE_LINTERP, wpos_w,
2795         this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
2796         this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
2797         interp_reg(VARYING_SLOT_POS, 3));
2798    /* Compute the pixel 1/W value from wpos.w. */
2799    this->pixel_w = fs_reg(this, glsl_type::float_type);
2800    emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
2801    this->current_annotation = NULL;
2802 }
2803
2804 /** Emits the interpolation for the varying inputs. */
2805 void
2806 fs_visitor::emit_interpolation_setup_gen6()
2807 {
2808    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
2809
2810    /* If the pixel centers end up used, the setup is the same as for gen4. */
2811    this->current_annotation = "compute pixel centers";
2812    fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
2813    fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
2814    int_pixel_x.type = BRW_REGISTER_TYPE_UW;
2815    int_pixel_y.type = BRW_REGISTER_TYPE_UW;
2816    emit(ADD(int_pixel_x,
2817             fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
2818             fs_reg(brw_imm_v(0x10101010))));
2819    emit(ADD(int_pixel_y,
2820             fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
2821             fs_reg(brw_imm_v(0x11001100))));
2822
2823    /* As of gen6, we can no longer mix float and int sources.  We have
2824     * to turn the integer pixel centers into floats for their actual
2825     * use.
2826     */
2827    this->pixel_x = fs_reg(this, glsl_type::float_type);
2828    this->pixel_y = fs_reg(this, glsl_type::float_type);
2829    emit(MOV(this->pixel_x, int_pixel_x));
2830    emit(MOV(this->pixel_y, int_pixel_y));
2831
2832    this->current_annotation = "compute pos.w";
2833    this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
2834    this->wpos_w = fs_reg(this, glsl_type::float_type);
2835    emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
2836
2837    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2838       uint8_t reg = payload.barycentric_coord_reg[i];
2839       this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
2840       this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
2841    }
2842
2843    this->current_annotation = NULL;
2844 }
2845
2846 void
2847 fs_visitor::emit_color_write(int target, int index, int first_color_mrf)
2848 {
2849    int reg_width = dispatch_width / 8;
2850    fs_inst *inst;
2851    fs_reg color = outputs[target];
2852    fs_reg mrf;
2853
2854    /* If there's no color data to be written, skip it. */
2855    if (color.file == BAD_FILE)
2856       return;
2857
2858    color.reg_offset += index;
2859
2860    if (dispatch_width == 8 || brw->gen >= 6) {
2861       /* SIMD8 write looks like:
2862        * m + 0: r0
2863        * m + 1: r1
2864        * m + 2: g0
2865        * m + 3: g1
2866        *
2867        * gen6 SIMD16 DP write looks like:
2868        * m + 0: r0
2869        * m + 1: r1
2870        * m + 2: g0
2871        * m + 3: g1
2872        * m + 4: b0
2873        * m + 5: b1
2874        * m + 6: a0
2875        * m + 7: a1
2876        */
2877       inst = emit(MOV(fs_reg(MRF, first_color_mrf + index * reg_width,
2878                              color.type),
2879                       color));
2880       inst->saturate = key->clamp_fragment_color;
2881    } else {
2882       /* pre-gen6 SIMD16 single source DP write looks like:
2883        * m + 0: r0
2884        * m + 1: g0
2885        * m + 2: b0
2886        * m + 3: a0
2887        * m + 4: r1
2888        * m + 5: g1
2889        * m + 6: b1
2890        * m + 7: a1
2891        */
2892       if (brw->has_compr4) {
2893          /* By setting the high bit of the MRF register number, we
2894           * indicate that we want COMPR4 mode - instead of doing the
2895           * usual destination + 1 for the second half we get
2896           * destination + 4.
2897           */
2898          inst = emit(MOV(fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index,
2899                                 color.type),
2900                          color));
2901          inst->saturate = key->clamp_fragment_color;
2902       } else {
2903          push_force_uncompressed();
2904          inst = emit(MOV(fs_reg(MRF, first_color_mrf + index, color.type),
2905                          color));
2906          inst->saturate = key->clamp_fragment_color;
2907          pop_force_uncompressed();
2908
2909          inst = emit(MOV(fs_reg(MRF, first_color_mrf + index + 4, color.type),
2910                          half(color, 1)));
2911          inst->force_sechalf = true;
2912          inst->saturate = key->clamp_fragment_color;
2913       }
2914    }
2915 }
2916
2917 static enum brw_conditional_mod
2918 cond_for_alpha_func(GLenum func)
2919 {
2920    switch(func) {
2921       case GL_GREATER:
2922          return BRW_CONDITIONAL_G;
2923       case GL_GEQUAL:
2924          return BRW_CONDITIONAL_GE;
2925       case GL_LESS:
2926          return BRW_CONDITIONAL_L;
2927       case GL_LEQUAL:
2928          return BRW_CONDITIONAL_LE;
2929       case GL_EQUAL:
2930          return BRW_CONDITIONAL_EQ;
2931       case GL_NOTEQUAL:
2932          return BRW_CONDITIONAL_NEQ;
2933       default:
2934          unreachable("Not reached");
2935    }
2936 }
2937
2938 /**
2939  * Alpha test support for when we compile it into the shader instead
2940  * of using the normal fixed-function alpha test.
2941  */
2942 void
2943 fs_visitor::emit_alpha_test()
2944 {
2945    this->current_annotation = "Alpha test";
2946
2947    fs_inst *cmp;
2948    if (key->alpha_test_func == GL_ALWAYS)
2949       return;
2950
2951    if (key->alpha_test_func == GL_NEVER) {
2952       /* f0.1 = 0 */
2953       fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
2954                                       BRW_REGISTER_TYPE_UW));
2955       cmp = emit(CMP(reg_null_f, some_reg, some_reg,
2956                      BRW_CONDITIONAL_NEQ));
2957    } else {
2958       /* RT0 alpha */
2959       fs_reg color = outputs[0];
2960       color.reg_offset += 3;
2961
2962       /* f0.1 &= func(color, ref) */
2963       cmp = emit(CMP(reg_null_f, color, fs_reg(key->alpha_test_ref),
2964                      cond_for_alpha_func(key->alpha_test_func)));
2965    }
2966    cmp->predicate = BRW_PREDICATE_NORMAL;
2967    cmp->flag_subreg = 1;
2968 }
2969
2970 void
2971 fs_visitor::emit_fb_writes()
2972 {
2973    this->current_annotation = "FB write header";
2974    bool header_present = true;
2975    /* We can potentially have a message length of up to 15, so we have to set
2976     * base_mrf to either 0 or 1 in order to fit in m0..m15.
2977     */
2978    int base_mrf = 1;
2979    int nr = base_mrf;
2980    int reg_width = dispatch_width / 8;
2981    bool src0_alpha_to_render_target = false;
2982
2983    if (do_dual_src) {
2984       no16("GL_ARB_blend_func_extended not yet supported in SIMD16.");
2985       if (dispatch_width == 16)
2986          do_dual_src = false;
2987    }
2988
2989    /* From the Sandy Bridge PRM, volume 4, page 198:
2990     *
2991     *     "Dispatched Pixel Enables. One bit per pixel indicating
2992     *      which pixels were originally enabled when the thread was
2993     *      dispatched. This field is only required for the end-of-
2994     *      thread message and on all dual-source messages."
2995     */
2996    if (brw->gen >= 6 &&
2997        (brw->is_haswell || brw->gen >= 8 || !this->fp->UsesKill) &&
2998        !do_dual_src &&
2999        key->nr_color_regions == 1) {
3000       header_present = false;
3001    }
3002
3003    if (header_present) {
3004       src0_alpha_to_render_target = brw->gen >= 6 &&
3005                                     !do_dual_src &&
3006                                     key->replicate_alpha;
3007       /* m2, m3 header */
3008       nr += 2;
3009    }
3010
3011    if (payload.aa_dest_stencil_reg) {
3012       push_force_uncompressed();
3013       emit(MOV(fs_reg(MRF, nr++),
3014                fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))));
3015       pop_force_uncompressed();
3016    }
3017
3018    prog_data->uses_omask =
3019       fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
3020    if (prog_data->uses_omask) {
3021       this->current_annotation = "FB write oMask";
3022       assert(this->sample_mask.file != BAD_FILE);
3023       /* Hand over gl_SampleMask. Only lower 16 bits are relevant. */
3024       emit(FS_OPCODE_SET_OMASK, fs_reg(MRF, nr, BRW_REGISTER_TYPE_UW), this->sample_mask);
3025       nr += 1;
3026    }
3027
3028    /* Reserve space for color. It'll be filled in per MRT below. */
3029    int color_mrf = nr;
3030    nr += 4 * reg_width;
3031    if (do_dual_src)
3032       nr += 4;
3033    if (src0_alpha_to_render_target)
3034       nr += reg_width;
3035
3036    if (source_depth_to_render_target) {
3037       if (brw->gen == 6) {
3038          /* For outputting oDepth on gen6, SIMD8 writes have to be
3039           * used.  This would require SIMD8 moves of each half to
3040           * message regs, kind of like pre-gen5 SIMD16 FB writes.
3041           * Just bail on doing so for now.
3042           */
3043          no16("Missing support for simd16 depth writes on gen6\n");
3044       }
3045
3046       if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3047          /* Hand over gl_FragDepth. */
3048          assert(this->frag_depth.file != BAD_FILE);
3049          emit(MOV(fs_reg(MRF, nr), this->frag_depth));
3050       } else {
3051          /* Pass through the payload depth. */
3052          emit(MOV(fs_reg(MRF, nr),
3053                   fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
3054       }
3055       nr += reg_width;
3056    }
3057
3058    if (payload.dest_depth_reg) {
3059       emit(MOV(fs_reg(MRF, nr),
3060                fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0))));
3061       nr += reg_width;
3062    }
3063
3064    if (do_dual_src) {
3065       fs_reg src0 = this->outputs[0];
3066       fs_reg src1 = this->dual_src_output;
3067
3068       this->current_annotation = ralloc_asprintf(this->mem_ctx,
3069                                                  "FB write src0");
3070       for (int i = 0; i < 4; i++) {
3071          fs_inst *inst = emit(MOV(fs_reg(MRF, color_mrf + i, src0.type), src0));
3072          src0.reg_offset++;
3073          inst->saturate = key->clamp_fragment_color;
3074       }
3075
3076       this->current_annotation = ralloc_asprintf(this->mem_ctx,
3077                                                  "FB write src1");
3078       for (int i = 0; i < 4; i++) {
3079          fs_inst *inst = emit(MOV(fs_reg(MRF, color_mrf + 4 + i, src1.type),
3080                                   src1));
3081          src1.reg_offset++;
3082          inst->saturate = key->clamp_fragment_color;
3083       }
3084
3085       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3086          emit_shader_time_end();
3087
3088       fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
3089       inst->target = 0;
3090       inst->base_mrf = base_mrf;
3091       inst->mlen = nr - base_mrf;
3092       inst->eot = true;
3093       inst->header_present = header_present;
3094       if ((brw->gen >= 8 || brw->is_haswell) && fp->UsesKill) {
3095          inst->predicate = BRW_PREDICATE_NORMAL;
3096          inst->flag_subreg = 1;
3097       }
3098
3099       prog_data->dual_src_blend = true;
3100       this->current_annotation = NULL;
3101       return;
3102    }
3103
3104    for (int target = 0; target < key->nr_color_regions; target++) {
3105       this->current_annotation = ralloc_asprintf(this->mem_ctx,
3106                                                  "FB write target %d",
3107                                                  target);
3108       /* If src0_alpha_to_render_target is true, include source zero alpha
3109        * data in RenderTargetWrite message for targets > 0.
3110        */
3111       int write_color_mrf = color_mrf;
3112       if (src0_alpha_to_render_target && target != 0) {
3113          fs_inst *inst;
3114          fs_reg color = outputs[0];
3115          color.reg_offset += 3;
3116
3117          inst = emit(MOV(fs_reg(MRF, write_color_mrf, color.type),
3118                          color));
3119          inst->saturate = key->clamp_fragment_color;
3120          write_color_mrf = color_mrf + reg_width;
3121       }
3122
3123       for (unsigned i = 0; i < this->output_components[target]; i++)
3124          emit_color_write(target, i, write_color_mrf);
3125
3126       bool eot = false;
3127       if (target == key->nr_color_regions - 1) {
3128          eot = true;
3129
3130          if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3131             emit_shader_time_end();
3132       }
3133
3134       fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
3135       inst->target = target;
3136       inst->base_mrf = base_mrf;
3137       if (src0_alpha_to_render_target && target == 0)
3138          inst->mlen = nr - base_mrf - reg_width;
3139       else
3140          inst->mlen = nr - base_mrf;
3141       inst->eot = eot;
3142       inst->header_present = header_present;
3143       if ((brw->gen >= 8 || brw->is_haswell) && fp->UsesKill) {
3144          inst->predicate = BRW_PREDICATE_NORMAL;
3145          inst->flag_subreg = 1;
3146       }
3147    }
3148
3149    if (key->nr_color_regions == 0) {
3150       /* Even if there's no color buffers enabled, we still need to send
3151        * alpha out the pipeline to our null renderbuffer to support
3152        * alpha-testing, alpha-to-coverage, and so on.
3153        */
3154       emit_color_write(0, 3, color_mrf);
3155
3156       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3157          emit_shader_time_end();
3158
3159       fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
3160       inst->base_mrf = base_mrf;
3161       inst->mlen = nr - base_mrf;
3162       inst->eot = true;
3163       inst->header_present = header_present;
3164       if ((brw->gen >= 8 || brw->is_haswell) && fp->UsesKill) {
3165          inst->predicate = BRW_PREDICATE_NORMAL;
3166          inst->flag_subreg = 1;
3167       }
3168    }
3169
3170    this->current_annotation = NULL;
3171 }
3172
3173 void
3174 fs_visitor::resolve_ud_negate(fs_reg *reg)
3175 {
3176    if (reg->type != BRW_REGISTER_TYPE_UD ||
3177        !reg->negate)
3178       return;
3179
3180    fs_reg temp = fs_reg(this, glsl_type::uint_type);
3181    emit(MOV(temp, *reg));
3182    *reg = temp;
3183 }
3184
3185 void
3186 fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
3187 {
3188    if (rvalue->type != glsl_type::bool_type)
3189       return;
3190
3191    fs_reg temp = fs_reg(this, glsl_type::bool_type);
3192    emit(AND(temp, *reg, fs_reg(1)));
3193    *reg = temp;
3194 }
3195
3196 fs_visitor::fs_visitor(struct brw_context *brw,
3197                        void *mem_ctx,
3198                        const struct brw_wm_prog_key *key,
3199                        struct brw_wm_prog_data *prog_data,
3200                        struct gl_shader_program *shader_prog,
3201                        struct gl_fragment_program *fp,
3202                        unsigned dispatch_width)
3203    : backend_visitor(brw, shader_prog, &fp->Base, &prog_data->base,
3204                      MESA_SHADER_FRAGMENT),
3205      key(key), prog_data(prog_data),
3206      dispatch_width(dispatch_width)
3207 {
3208    this->fp = fp;
3209    this->mem_ctx = mem_ctx;
3210    this->failed = false;
3211    this->simd16_unsupported = false;
3212    this->no16_msg = NULL;
3213    this->variable_ht = hash_table_ctor(0,
3214                                        hash_table_pointer_hash,
3215                                        hash_table_pointer_compare);
3216
3217    memset(&this->payload, 0, sizeof(this->payload));
3218    memset(this->outputs, 0, sizeof(this->outputs));
3219    memset(this->output_components, 0, sizeof(this->output_components));
3220    this->source_depth_to_render_target = false;
3221    this->runtime_check_aads_emit = false;
3222    this->first_non_payload_grf = 0;
3223    this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3224
3225    this->current_annotation = NULL;
3226    this->base_ir = NULL;
3227
3228    this->virtual_grf_sizes = NULL;
3229    this->virtual_grf_count = 0;
3230    this->virtual_grf_array_size = 0;
3231    this->virtual_grf_start = NULL;
3232    this->virtual_grf_end = NULL;
3233    this->live_intervals = NULL;
3234    this->regs_live_at_ip = NULL;
3235
3236    this->uniforms = 0;
3237    this->last_scratch = 0;
3238    this->pull_constant_loc = NULL;
3239    this->push_constant_loc = NULL;
3240
3241    this->force_uncompressed_stack = 0;
3242
3243    this->spilled_any_registers = false;
3244    this->do_dual_src = false;
3245
3246    if (dispatch_width == 8)
3247       this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
3248 }
3249
3250 fs_visitor::~fs_visitor()
3251 {
3252    hash_table_dtor(this->variable_ht);
3253 }