src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_visitor.cpp
  25  *
  26  * This file supports generating the FS LIR from the GLSL IR.  The LIR
  27  * makes it easier to do backend-specific optimizations than doing so
  28  * in the GLSL IR or in the native code.
  29  */
  30 extern "C" {
  31
  32 #include <sys/types.h>
  33
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "program/prog_parameter.h"
  37 #include "program/prog_print.h"
  38 #include "program/prog_optimize.h"
  39 #include "util/register_allocate.h"
  40 #include "program/hash_table.h"
  41 #include "brw_context.h"
  42 #include "brw_eu.h"
  43 #include "brw_wm.h"
  44 }
  45 #include "brw_vec4.h"
  46 #include "brw_fs.h"
  47 #include "main/uniforms.h"
  48 #include "glsl/glsl_types.h"
  49 #include "glsl/ir_optimization.h"
  50 #include "program/sampler.h"
  51
  52
  53 fs_reg *
  54 fs_visitor::emit_vs_system_value(int location)
  55 {
  56    fs_reg *reg = new(this->mem_ctx)
  57       fs_reg(ATTR, VERT_ATTRIB_MAX, BRW_REGISTER_TYPE_D);
  58    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
  59
  60    switch (location) {
  61    case SYSTEM_VALUE_BASE_VERTEX:
  62       reg->reg_offset = 0;
  63       vs_prog_data->uses_vertexid = true;
  64       break;
  65    case SYSTEM_VALUE_VERTEX_ID:
  66    case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
  67       reg->reg_offset = 2;
  68       vs_prog_data->uses_vertexid = true;
  69       break;
  70    case SYSTEM_VALUE_INSTANCE_ID:
  71       reg->reg_offset = 3;
  72       vs_prog_data->uses_instanceid = true;
  73       break;
  74    default:
  75       unreachable("not reached");
  76    }
  77
  78    return reg;
  79 }
  80
  81 void
  82 fs_visitor::visit(ir_variable *ir)
  83 {
  84    fs_reg *reg = NULL;
  85
  86    if (variable_storage(ir))
  87       return;
  88
  89    if (ir->data.mode == ir_var_shader_in) {
  90       assert(ir->data.location != -1);
  91       if (stage == MESA_SHADER_VERTEX) {
  92          reg = new(this->mem_ctx)
  93             fs_reg(ATTR, ir->data.location,
  94                    brw_type_for_base_type(ir->type->get_scalar_type()));
  95       } else if (ir->data.location == VARYING_SLOT_POS) {
  96          reg = emit_fragcoord_interpolation(ir->data.pixel_center_integer,
  97                                             ir->data.origin_upper_left);
  98       } else if (ir->data.location == VARYING_SLOT_FACE) {
  99          reg = emit_frontfacing_interpolation();
 100       } else {
 101          reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
 102          emit_general_interpolation(*reg, ir->name, ir->type,
 103                                     (glsl_interp_qualifier) ir->data.interpolation,
 104                                     ir->data.location, ir->data.centroid,
 105                                     ir->data.sample);
 106       }
 107       assert(reg);
 108       hash_table_insert(this->variable_ht, reg, ir);
 109       return;
 110    } else if (ir->data.mode == ir_var_shader_out) {
 111       reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
 112
 113       if (stage == MESA_SHADER_VERTEX) {
 114          int vector_elements =
 115             ir->type->is_array() ? ir->type->fields.array->vector_elements
 116                                  : ir->type->vector_elements;
 117
 118          for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) {
 119             int output = ir->data.location + i;
 120             this->outputs[output] = *reg;
 121             this->outputs[output].reg_offset = i * 4;
 122             this->output_components[output] = vector_elements;
 123          }
 124
 125       } else if (ir->data.index > 0) {
 126          assert(ir->data.location == FRAG_RESULT_DATA0);
 127          assert(ir->data.index == 1);
 128          this->dual_src_output = *reg;
 129          this->do_dual_src = true;
 130       } else if (ir->data.location == FRAG_RESULT_COLOR) {
 131          /* Writing gl_FragColor outputs to all color regions. */
 132          assert(stage == MESA_SHADER_FRAGMENT);
 133          brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 134          for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
 135             this->outputs[i] = *reg;
 136             this->output_components[i] = 4;
 137          }
 138       } else if (ir->data.location == FRAG_RESULT_DEPTH) {
 139          this->frag_depth = *reg;
 140       } else if (ir->data.location == FRAG_RESULT_SAMPLE_MASK) {
 141          this->sample_mask = *reg;
 142       } else {
 143          /* gl_FragData or a user-defined FS output */
 144          assert(ir->data.location >= FRAG_RESULT_DATA0 &&
 145                 ir->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
 146
 147          int vector_elements =
 148             ir->type->is_array() ? ir->type->fields.array->vector_elements
 149                                  : ir->type->vector_elements;
 150
 151          /* General color output. */
 152          for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
 153             int output = ir->data.location - FRAG_RESULT_DATA0 + i;
 154             this->outputs[output] = offset(*reg, vector_elements * i);
 155             this->output_components[output] = vector_elements;
 156          }
 157       }
 158    } else if (ir->data.mode == ir_var_uniform) {
 159       int param_index = uniforms;
 160
 161       /* Thanks to the lower_ubo_reference pass, we will see only
 162        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 163        * variables, so no need for them to be in variable_ht.
 164        *
 165        * Some uniforms, such as samplers and atomic counters, have no actual
 166        * storage, so we should ignore them.
 167        */
 168       if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
 169          return;
 170
 171       if (dispatch_width == 16) {
 172          if (!variable_storage(ir)) {
 173             fail("Failed to find uniform '%s' in SIMD16\n", ir->name);
 174          }
 175          return;
 176       }
 177
 178       param_size[param_index] = type_size(ir->type);
 179       if (!strncmp(ir->name, "gl_", 3)) {
 180          setup_builtin_uniform_values(ir);
 181       } else {
 182          setup_uniform_values(ir);
 183       }
 184
 185       reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
 186       reg->type = brw_type_for_base_type(ir->type);
 187
 188    } else if (ir->data.mode == ir_var_system_value) {
 189       switch (ir->data.location) {
 190       case SYSTEM_VALUE_BASE_VERTEX:
 191       case SYSTEM_VALUE_VERTEX_ID:
 192       case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
 193       case SYSTEM_VALUE_INSTANCE_ID:
 194          reg = emit_vs_system_value(ir->data.location);
 195          break;
 196       case SYSTEM_VALUE_SAMPLE_POS:
 197          reg = emit_samplepos_setup();
 198          break;
 199       case SYSTEM_VALUE_SAMPLE_ID:
 200          reg = emit_sampleid_setup();
 201          break;
 202       case SYSTEM_VALUE_SAMPLE_MASK_IN:
 203          assert(brw->gen >= 7);
 204          reg = new(mem_ctx)
 205             fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
 206                           BRW_REGISTER_TYPE_D));
 207          break;
 208       }
 209    }
 210
 211    if (!reg)
 212       reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
 213
 214    hash_table_insert(this->variable_ht, reg, ir);
 215 }
 216
 217 void
 218 fs_visitor::visit(ir_dereference_variable *ir)
 219 {
 220    fs_reg *reg = variable_storage(ir->var);
 221
 222    if (!reg) {
 223       fail("Failed to find variable storage for %s\n", ir->var->name);
 224       this->result = fs_reg(reg_null_d);
 225       return;
 226    }
 227    this->result = *reg;
 228 }
 229
 230 void
 231 fs_visitor::visit(ir_dereference_record *ir)
 232 {
 233    const glsl_type *struct_type = ir->record->type;
 234
 235    ir->record->accept(this);
 236
 237    unsigned int off = 0;
 238    for (unsigned int i = 0; i < struct_type->length; i++) {
 239       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
 240          break;
 241       off += type_size(struct_type->fields.structure[i].type);
 242    }
 243    this->result = offset(this->result, off);
 244    this->result.type = brw_type_for_base_type(ir->type);
 245 }
 246
 247 void
 248 fs_visitor::visit(ir_dereference_array *ir)
 249 {
 250    ir_constant *constant_index;
 251    fs_reg src;
 252    int element_size = type_size(ir->type);
 253
 254    constant_index = ir->array_index->as_constant();
 255
 256    ir->array->accept(this);
 257    src = this->result;
 258    src.type = brw_type_for_base_type(ir->type);
 259
 260    if (constant_index) {
 261       if (src.file == ATTR) {
 262          /* Attribute arrays get loaded as one vec4 per element.  In that case
 263           * offset the source register.
 264           */
 265          src.reg += constant_index->value.i[0];
 266       } else {
 267          assert(src.file == UNIFORM || src.file == GRF || src.file == HW_REG);
 268          src = offset(src, constant_index->value.i[0] * element_size);
 269       }
 270    } else {
 271       /* Variable index array dereference.  We attach the variable index
 272        * component to the reg as a pointer to a register containing the
 273        * offset.  Currently only uniform arrays are supported in this patch,
 274        * and that reladdr pointer is resolved by
 275        * move_uniform_array_access_to_pull_constants().  All other array types
 276        * are lowered by lower_variable_index_to_cond_assign().
 277        */
 278       ir->array_index->accept(this);
 279
 280       fs_reg index_reg;
 281       index_reg = vgrf(glsl_type::int_type);
 282       emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));
 283
 284       if (src.reladdr) {
 285          emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
 286       }
 287
 288       src.reladdr = ralloc(mem_ctx, fs_reg);
 289       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
 290    }
 291    this->result = src;
 292 }
 293
 294 void
 295 fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
 296                      const fs_reg &a)
 297 {
 298    if (brw->gen < 6) {
 299       /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
 300       fs_reg y_times_a           = vgrf(glsl_type::float_type);
 301       fs_reg one_minus_a         = vgrf(glsl_type::float_type);
 302       fs_reg x_times_one_minus_a = vgrf(glsl_type::float_type);
 303
 304       emit(MUL(y_times_a, y, a));
 305
 306       fs_reg negative_a = a;
 307       negative_a.negate = !a.negate;
 308       emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
 309       emit(MUL(x_times_one_minus_a, x, one_minus_a));
 310
 311       emit(ADD(dst, x_times_one_minus_a, y_times_a));
 312    } else {
 313       /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
 314        * we need to reorder the operands.
 315        */
 316       emit(LRP(dst, a, y, x));
 317    }
 318 }
 319
 320 void
 321 fs_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
 322                         const fs_reg &src0, const fs_reg &src1)
 323 {
 324    assert(conditionalmod == BRW_CONDITIONAL_GE ||
 325           conditionalmod == BRW_CONDITIONAL_L);
 326
 327    fs_inst *inst;
 328
 329    if (brw->gen >= 6) {
 330       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 331       inst->conditional_mod = conditionalmod;
 332    } else {
 333       emit(CMP(reg_null_d, src0, src1, conditionalmod));
 334
 335       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 336       inst->predicate = BRW_PREDICATE_NORMAL;
 337    }
 338 }
 339
 340 bool
 341 fs_visitor::try_emit_saturate(ir_expression *ir)
 342 {
 343    if (ir->operation != ir_unop_saturate)
 344       return false;
 345
 346    ir_rvalue *sat_val = ir->operands[0];
 347
 348    fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
 349
 350    sat_val->accept(this);
 351    fs_reg src = this->result;
 352
 353    fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
 354
 355    /* If the last instruction from our accept() generated our
 356     * src, just set the saturate flag instead of emmitting a separate mov.
 357     */
 358    fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
 359    if (modify && modify->regs_written == modify->dst.width / 8 &&
 360        modify->can_do_saturate()) {
 361       modify->saturate = true;
 362       this->result = src;
 363       return true;
 364    }
 365
 366    return false;
 367 }
 368
 369 bool
 370 fs_visitor::try_emit_line(ir_expression *ir)
 371 {
 372    /* LINE's src0 must be of type float. */
 373    if (ir->type != glsl_type::float_type)
 374       return false;
 375
 376    ir_rvalue *nonmul = ir->operands[1];
 377    ir_expression *mul = ir->operands[0]->as_expression();
 378
 379    if (!mul || mul->operation != ir_binop_mul) {
 380       nonmul = ir->operands[0];
 381       mul = ir->operands[1]->as_expression();
 382
 383       if (!mul || mul->operation != ir_binop_mul)
 384          return false;
 385    }
 386
 387    ir_constant *const_add = nonmul->as_constant();
 388    if (!const_add)
 389       return false;
 390
 391    int add_operand_vf = brw_float_to_vf(const_add->value.f[0]);
 392    if (add_operand_vf == -1)
 393       return false;
 394
 395    ir_rvalue *non_const_mul = mul->operands[1];
 396    ir_constant *const_mul = mul->operands[0]->as_constant();
 397    if (!const_mul) {
 398       const_mul = mul->operands[1]->as_constant();
 399
 400       if (!const_mul)
 401          return false;
 402
 403       non_const_mul = mul->operands[0];
 404    }
 405
 406    int mul_operand_vf = brw_float_to_vf(const_mul->value.f[0]);
 407    if (mul_operand_vf == -1)
 408       return false;
 409
 410    non_const_mul->accept(this);
 411    fs_reg src1 = this->result;
 412
 413    fs_reg src0 = vgrf(ir->type);
 414    emit(BRW_OPCODE_MOV, src0,
 415         fs_reg((uint8_t)mul_operand_vf, 0, 0, (uint8_t)add_operand_vf));
 416
 417    this->result = vgrf(ir->type);
 418    emit(BRW_OPCODE_LINE, this->result, src0, src1);
 419    return true;
 420 }
 421
 422 bool
 423 fs_visitor::try_emit_mad(ir_expression *ir)
 424 {
 425    /* 3-src instructions were introduced in gen6. */
 426    if (brw->gen < 6)
 427       return false;
 428
 429    /* MAD can only handle floating-point data. */
 430    if (ir->type != glsl_type::float_type)
 431       return false;
 432
 433    ir_rvalue *nonmul = ir->operands[1];
 434    ir_expression *mul = ir->operands[0]->as_expression();
 435
 436    bool mul_negate = false, mul_abs = false;
 437    if (mul && mul->operation == ir_unop_abs) {
 438       mul = mul->operands[0]->as_expression();
 439       mul_abs = true;
 440    } else if (mul && mul->operation == ir_unop_neg) {
 441       mul = mul->operands[0]->as_expression();
 442       mul_negate = true;
 443    }
 444
 445    if (!mul || mul->operation != ir_binop_mul) {
 446       nonmul = ir->operands[0];
 447       mul = ir->operands[1]->as_expression();
 448
 449       if (mul && mul->operation == ir_unop_abs) {
 450          mul = mul->operands[0]->as_expression();
 451          mul_abs = true;
 452       } else if (mul && mul->operation == ir_unop_neg) {
 453          mul = mul->operands[0]->as_expression();
 454          mul_negate = true;
 455       }
 456
 457       if (!mul || mul->operation != ir_binop_mul)
 458          return false;
 459    }
 460
 461    nonmul->accept(this);
 462    fs_reg src0 = this->result;
 463
 464    mul->operands[0]->accept(this);
 465    fs_reg src1 = this->result;
 466    src1.negate ^= mul_negate;
 467    src1.abs = mul_abs;
 468    if (mul_abs)
 469       src1.negate = false;
 470
 471    mul->operands[1]->accept(this);
 472    fs_reg src2 = this->result;
 473    src2.abs = mul_abs;
 474    if (mul_abs)
 475       src2.negate = false;
 476
 477    this->result = vgrf(ir->type);
 478    emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
 479
 480    return true;
 481 }
 482
 483 static int
 484 pack_pixel_offset(float x)
 485 {
 486    /* Clamp upper end of the range to +7/16. See explanation in non-constant
 487     * offset case below. */
 488    int n = MIN2((int)(x * 16), 7);
 489    return n & 0xf;
 490 }
 491
 492 void
 493 fs_visitor::emit_interpolate_expression(ir_expression *ir)
 494 {
 495    /* in SIMD16 mode, the pixel interpolator returns coords interleaved
 496     * 8 channels at a time, same as the barycentric coords presented in
 497     * the FS payload. this requires a bit of extra work to support.
 498     */
 499    no16("interpolate_at_* not yet supported in SIMD16 mode.");
 500
 501    assert(stage == MESA_SHADER_FRAGMENT);
 502    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 503
 504    ir_dereference * deref = ir->operands[0]->as_dereference();
 505    ir_swizzle * swiz = NULL;
 506    if (!deref) {
 507       /* the api does not allow a swizzle here, but the varying packing code
 508        * may have pushed one into here.
 509        */
 510       swiz = ir->operands[0]->as_swizzle();
 511       assert(swiz);
 512       deref = swiz->val->as_dereference();
 513    }
 514    assert(deref);
 515    ir_variable * var = deref->variable_referenced();
 516    assert(var);
 517
 518    /* 1. collect interpolation factors */
 519
 520    fs_reg dst_x = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
 521    fs_reg dst_y = offset(dst_x, 1);
 522
 523    /* for most messages, we need one reg of ignored data; the hardware requires mlen==1
 524     * even when there is no payload. in the per-slot offset case, we'll replace this with
 525     * the proper source data. */
 526    fs_reg src = vgrf(glsl_type::float_type);
 527    int mlen = 1;     /* one reg unless overriden */
 528    int reg_width = dispatch_width / 8;
 529    fs_inst *inst;
 530
 531    switch (ir->operation) {
 532    case ir_unop_interpolate_at_centroid:
 533       inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
 534       break;
 535
 536    case ir_binop_interpolate_at_sample: {
 537       ir_constant *sample_num = ir->operands[1]->as_constant();
 538       assert(sample_num || !"nonconstant sample number should have been lowered.");
 539
 540       unsigned msg_data = sample_num->value.i[0] << 4;
 541       inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, fs_reg(msg_data));
 542       break;
 543    }
 544
 545    case ir_binop_interpolate_at_offset: {
 546       ir_constant *const_offset = ir->operands[1]->as_constant();
 547       if (const_offset) {
 548          unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
 549                             (pack_pixel_offset(const_offset->value.f[1]) << 4);
 550          inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
 551                      fs_reg(msg_data));
 552       } else {
 553          /* pack the operands: hw wants offsets as 4 bit signed ints */
 554          ir->operands[1]->accept(this);
 555          src = vgrf(glsl_type::ivec2_type);
 556          fs_reg src2 = src;
 557          for (int i = 0; i < 2; i++) {
 558             fs_reg temp = vgrf(glsl_type::float_type);
 559             emit(MUL(temp, this->result, fs_reg(16.0f)));
 560             emit(MOV(src2, temp));  /* float to int */
 561
 562             /* Clamp the upper end of the range to +7/16. ARB_gpu_shader5 requires
 563              * that we support a maximum offset of +0.5, which isn't representable
 564              * in a S0.4 value -- if we didn't clamp it, we'd end up with -8/16,
 565              * which is the opposite of what the shader author wanted.
 566              *
 567              * This is legal due to ARB_gpu_shader5's quantization rules:
 568              *
 569              * "Not all values of <offset> may be supported; x and y offsets may
 570              * be rounded to fixed-point values with the number of fraction bits
 571              * given by the implementation-dependent constant
 572              * FRAGMENT_INTERPOLATION_OFFSET_BITS"
 573              */
 574
 575             fs_inst *inst = emit(BRW_OPCODE_SEL, src2, src2, fs_reg(7));
 576             inst->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
 577
 578             src2 = offset(src2, 1);
 579             this->result = offset(this->result, 1);
 580          }
 581
 582          mlen = 2 * reg_width;
 583          inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
 584                      fs_reg(0u));
 585       }
 586       break;
 587    }
 588
 589    default:
 590       unreachable("not reached");
 591    }
 592
 593    inst->mlen = mlen;
 594    inst->regs_written = 2 * reg_width; /* 2 floats per slot returned */
 595    inst->pi_noperspective = var->determine_interpolation_mode(key->flat_shade) ==
 596          INTERP_QUALIFIER_NOPERSPECTIVE;
 597
 598    /* 2. emit linterp */
 599
 600    fs_reg res = vgrf(ir->type);
 601    this->result = res;
 602
 603    for (int i = 0; i < ir->type->vector_elements; i++) {
 604       int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
 605       emit(FS_OPCODE_LINTERP, res,
 606            dst_x, dst_y,
 607            fs_reg(interp_reg(var->data.location, ch)));
 608       res = offset(res, 1);
 609    }
 610 }
 611
 612 void
 613 fs_visitor::visit(ir_expression *ir)
 614 {
 615    unsigned int operand;
 616    fs_reg op[3], temp;
 617    fs_inst *inst;
 618    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
 619
 620    assert(ir->get_num_operands() <= 3);
 621
 622    if (try_emit_saturate(ir))
 623       return;
 624
 625    /* Deal with the real oddball stuff first */
 626    switch (ir->operation) {
 627    case ir_binop_add:
 628       if (brw->gen <= 5 && try_emit_line(ir))
 629          return;
 630       if (try_emit_mad(ir))
 631          return;
 632       break;
 633
 634    case ir_triop_csel:
 635       ir->operands[1]->accept(this);
 636       op[1] = this->result;
 637       ir->operands[2]->accept(this);
 638       op[2] = this->result;
 639
 640       emit_bool_to_cond_code(ir->operands[0]);
 641
 642       this->result = vgrf(ir->type);
 643       inst = emit(SEL(this->result, op[1], op[2]));
 644       inst->predicate = BRW_PREDICATE_NORMAL;
 645       return;
 646
 647    case ir_unop_interpolate_at_centroid:
 648    case ir_binop_interpolate_at_offset:
 649    case ir_binop_interpolate_at_sample:
 650       emit_interpolate_expression(ir);
 651       return;
 652
 653    default:
 654       break;
 655    }
 656
 657    for (operand = 0; operand < ir->get_num_operands(); operand++) {
 658       ir->operands[operand]->accept(this);
 659       if (this->result.file == BAD_FILE) {
 660          fail("Failed to get tree for expression operand:\n");
 661          ir->operands[operand]->fprint(stderr);
 662          fprintf(stderr, "\n");
 663       }
 664       assert(this->result.file == GRF ||
 665              this->result.file == UNIFORM || this->result.file == ATTR);
 666       op[operand] = this->result;
 667
 668       /* Matrix expression operands should have been broken down to vector
 669        * operations already.
 670        */
 671       assert(!ir->operands[operand]->type->is_matrix());
 672       /* And then those vector operands should have been broken down to scalar.
 673        */
 674       assert(!ir->operands[operand]->type->is_vector());
 675    }
 676
 677    /* Storage for our result.  If our result goes into an assignment, it will
 678     * just get copy-propagated out, so no worries.
 679     */
 680    this->result = vgrf(ir->type);
 681
 682    switch (ir->operation) {
 683    case ir_unop_logic_not:
 684       emit(NOT(this->result, op[0]));
 685       break;
 686    case ir_unop_neg:
 687       op[0].negate = !op[0].negate;
 688       emit(MOV(this->result, op[0]));
 689       break;
 690    case ir_unop_abs:
 691       op[0].abs = true;
 692       op[0].negate = false;
 693       emit(MOV(this->result, op[0]));
 694       break;
 695    case ir_unop_sign:
 696       if (ir->type->is_float()) {
 697          /* AND(val, 0x80000000) gives the sign bit.
 698           *
 699           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
 700           * zero.
 701           */
 702          emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 703
 704          op[0].type = BRW_REGISTER_TYPE_UD;
 705          this->result.type = BRW_REGISTER_TYPE_UD;
 706          emit(AND(this->result, op[0], fs_reg(0x80000000u)));
 707
 708          inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u)));
 709          inst->predicate = BRW_PREDICATE_NORMAL;
 710
 711          this->result.type = BRW_REGISTER_TYPE_F;
 712       } else {
 713          /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
 714           *               -> non-negative val generates 0x00000000.
 715           *  Predicated OR sets 1 if val is positive.
 716           */
 717          emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));
 718
 719          emit(ASR(this->result, op[0], fs_reg(31)));
 720
 721          inst = emit(OR(this->result, this->result, fs_reg(1)));
 722          inst->predicate = BRW_PREDICATE_NORMAL;
 723       }
 724       break;
 725    case ir_unop_rcp:
 726       emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
 727       break;
 728
 729    case ir_unop_exp2:
 730       emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
 731       break;
 732    case ir_unop_log2:
 733       emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
 734       break;
 735    case ir_unop_exp:
 736    case ir_unop_log:
 737       unreachable("not reached: should be handled by ir_explog_to_explog2");
 738    case ir_unop_sin:
 739    case ir_unop_sin_reduced:
 740       emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
 741       break;
 742    case ir_unop_cos:
 743    case ir_unop_cos_reduced:
 744       emit_math(SHADER_OPCODE_COS, this->result, op[0]);
 745       break;
 746
 747    case ir_unop_dFdx:
 748       /* Select one of the two opcodes based on the glHint value. */
 749       if (fs_key->high_quality_derivatives)
 750          emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
 751       else
 752          emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
 753       break;
 754
 755    case ir_unop_dFdx_coarse:
 756       emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
 757       break;
 758
 759    case ir_unop_dFdx_fine:
 760       emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
 761       break;
 762
 763    case ir_unop_dFdy:
 764       /* Select one of the two opcodes based on the glHint value. */
 765       if (fs_key->high_quality_derivatives)
 766          emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
 767       else
 768          emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
 769       break;
 770
 771    case ir_unop_dFdy_coarse:
 772       emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
 773       break;
 774
 775    case ir_unop_dFdy_fine:
 776       emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
 777       break;
 778
 779    case ir_binop_add:
 780       emit(ADD(this->result, op[0], op[1]));
 781       break;
 782    case ir_binop_sub:
 783       unreachable("not reached: should be handled by ir_sub_to_add_neg");
 784
 785    case ir_binop_mul:
 786       if (brw->gen < 8 && ir->type->is_integer()) {
 787          /* For integer multiplication, the MUL uses the low 16 bits
 788           * of one of the operands (src0 on gen6, src1 on gen7).  The
 789           * MACH accumulates in the contribution of the upper 16 bits
 790           * of that operand.
 791           */
 792          if (ir->operands[0]->is_uint16_constant()) {
 793             if (brw->gen < 7)
 794                emit(MUL(this->result, op[0], op[1]));
 795             else
 796                emit(MUL(this->result, op[1], op[0]));
 797          } else if (ir->operands[1]->is_uint16_constant()) {
 798             if (brw->gen < 7)
 799                emit(MUL(this->result, op[1], op[0]));
 800             else
 801                emit(MUL(this->result, op[0], op[1]));
 802          } else {
 803             if (brw->gen >= 7)
 804                no16("SIMD16 explicit accumulator operands unsupported\n");
 805
 806             struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 807                                         this->result.type);
 808
 809             emit(MUL(acc, op[0], op[1]));
 810             emit(MACH(reg_null_d, op[0], op[1]));
 811             emit(MOV(this->result, fs_reg(acc)));
 812          }
 813       } else {
 814          emit(MUL(this->result, op[0], op[1]));
 815       }
 816       break;
 817    case ir_binop_imul_high: {
 818       if (brw->gen == 7)
 819          no16("SIMD16 explicit accumulator operands unsupported\n");
 820
 821       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 822                                   this->result.type);
 823
 824       fs_inst *mul = emit(MUL(acc, op[0], op[1]));
 825       emit(MACH(this->result, op[0], op[1]));
 826
 827       /* Until Gen8, integer multiplies read 32-bits from one source, and
 828        * 16-bits from the other, and relying on the MACH instruction to
 829        * generate the high bits of the result.
 830        *
 831        * On Gen8, the multiply instruction does a full 32x32-bit multiply,
 832        * but in order to do a 64x64-bit multiply we have to simulate the
 833        * previous behavior and then use a MACH instruction.
 834        *
 835        * FINISHME: Don't use source modifiers on src1.
 836        */
 837       if (brw->gen >= 8) {
 838          assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
 839                 mul->src[1].type == BRW_REGISTER_TYPE_UD);
 840          if (mul->src[1].type == BRW_REGISTER_TYPE_D) {
 841             mul->src[1].type = BRW_REGISTER_TYPE_W;
 842          } else {
 843             mul->src[1].type = BRW_REGISTER_TYPE_UW;
 844          }
 845       }
 846
 847       break;
 848    }
 849    case ir_binop_div:
 850       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
 851       assert(ir->type->is_integer());
 852       emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
 853       break;
 854    case ir_binop_carry: {
 855       if (brw->gen == 7)
 856          no16("SIMD16 explicit accumulator operands unsupported\n");
 857
 858       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 859                                   BRW_REGISTER_TYPE_UD);
 860
 861       emit(ADDC(reg_null_ud, op[0], op[1]));
 862       emit(MOV(this->result, fs_reg(acc)));
 863       break;
 864    }
 865    case ir_binop_borrow: {
 866       if (brw->gen == 7)
 867          no16("SIMD16 explicit accumulator operands unsupported\n");
 868
 869       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 870                                   BRW_REGISTER_TYPE_UD);
 871
 872       emit(SUBB(reg_null_ud, op[0], op[1]));
 873       emit(MOV(this->result, fs_reg(acc)));
 874       break;
 875    }
 876    case ir_binop_mod:
 877       /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
 878       assert(ir->type->is_integer());
 879       emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
 880       break;
 881
 882    case ir_binop_less:
 883    case ir_binop_greater:
 884    case ir_binop_lequal:
 885    case ir_binop_gequal:
 886    case ir_binop_equal:
 887    case ir_binop_all_equal:
 888    case ir_binop_nequal:
 889    case ir_binop_any_nequal:
 890       if (brw->gen <= 5) {
 891          resolve_bool_comparison(ir->operands[0], &op[0]);
 892          resolve_bool_comparison(ir->operands[1], &op[1]);
 893       }
 894
 895       emit(CMP(this->result, op[0], op[1],
 896                brw_conditional_for_comparison(ir->operation)));
 897       break;
 898
 899    case ir_binop_logic_xor:
 900       emit(XOR(this->result, op[0], op[1]));
 901       break;
 902
 903    case ir_binop_logic_or:
 904       emit(OR(this->result, op[0], op[1]));
 905       break;
 906
 907    case ir_binop_logic_and:
 908       emit(AND(this->result, op[0], op[1]));
 909       break;
 910
 911    case ir_binop_dot:
 912    case ir_unop_any:
 913       unreachable("not reached: should be handled by brw_fs_channel_expressions");
 914
 915    case ir_unop_noise:
 916       unreachable("not reached: should be handled by lower_noise");
 917
 918    case ir_quadop_vector:
 919       unreachable("not reached: should be handled by lower_quadop_vector");
 920
 921    case ir_binop_vector_extract:
 922       unreachable("not reached: should be handled by lower_vec_index_to_cond_assign()");
 923
 924    case ir_triop_vector_insert:
 925       unreachable("not reached: should be handled by lower_vector_insert()");
 926
 927    case ir_binop_ldexp:
 928       unreachable("not reached: should be handled by ldexp_to_arith()");
 929
 930    case ir_unop_sqrt:
 931       emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
 932       break;
 933
 934    case ir_unop_rsq:
 935       emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
 936       break;
 937
 938    case ir_unop_bitcast_i2f:
 939    case ir_unop_bitcast_u2f:
 940       op[0].type = BRW_REGISTER_TYPE_F;
 941       this->result = op[0];
 942       break;
 943    case ir_unop_i2u:
 944    case ir_unop_bitcast_f2u:
 945       op[0].type = BRW_REGISTER_TYPE_UD;
 946       this->result = op[0];
 947       break;
 948    case ir_unop_u2i:
 949    case ir_unop_bitcast_f2i:
 950       op[0].type = BRW_REGISTER_TYPE_D;
 951       this->result = op[0];
 952       break;
 953    case ir_unop_i2f:
 954    case ir_unop_u2f:
 955    case ir_unop_f2i:
 956    case ir_unop_f2u:
 957       emit(MOV(this->result, op[0]));
 958       break;
 959
 960    case ir_unop_b2i:
 961       emit(AND(this->result, op[0], fs_reg(1)));
 962       break;
 963    case ir_unop_b2f:
 964       if (brw->gen <= 5) {
 965          resolve_bool_comparison(ir->operands[0], &op[0]);
 966       }
 967       op[0].type = BRW_REGISTER_TYPE_D;
 968       this->result.type = BRW_REGISTER_TYPE_D;
 969       emit(AND(this->result, op[0], fs_reg(0x3f800000u)));
 970       this->result.type = BRW_REGISTER_TYPE_F;
 971       break;
 972
 973    case ir_unop_f2b:
 974       emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 975       break;
 976    case ir_unop_i2b:
 977       emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
 978       break;
 979
 980    case ir_unop_trunc:
 981       emit(RNDZ(this->result, op[0]));
 982       break;
 983    case ir_unop_ceil: {
 984          fs_reg tmp = vgrf(ir->type);
 985          op[0].negate = !op[0].negate;
 986          emit(RNDD(tmp, op[0]));
 987          tmp.negate = true;
 988          emit(MOV(this->result, tmp));
 989       }
 990       break;
 991    case ir_unop_floor:
 992       emit(RNDD(this->result, op[0]));
 993       break;
 994    case ir_unop_fract:
 995       emit(FRC(this->result, op[0]));
 996       break;
 997    case ir_unop_round_even:
 998       emit(RNDE(this->result, op[0]));
 999       break;
1000
1001    case ir_binop_min:
1002    case ir_binop_max:
1003       resolve_ud_negate(&op[0]);
1004       resolve_ud_negate(&op[1]);
1005       emit_minmax(ir->operation == ir_binop_min ?
1006                   BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
1007                   this->result, op[0], op[1]);
1008       break;
1009    case ir_unop_pack_snorm_2x16:
1010    case ir_unop_pack_snorm_4x8:
1011    case ir_unop_pack_unorm_2x16:
1012    case ir_unop_pack_unorm_4x8:
1013    case ir_unop_unpack_snorm_2x16:
1014    case ir_unop_unpack_snorm_4x8:
1015    case ir_unop_unpack_unorm_2x16:
1016    case ir_unop_unpack_unorm_4x8:
1017    case ir_unop_unpack_half_2x16:
1018    case ir_unop_pack_half_2x16:
1019       unreachable("not reached: should be handled by lower_packing_builtins");
1020    case ir_unop_unpack_half_2x16_split_x:
1021       emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
1022       break;
1023    case ir_unop_unpack_half_2x16_split_y:
1024       emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
1025       break;
1026    case ir_binop_pow:
1027       emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
1028       break;
1029
1030    case ir_unop_bitfield_reverse:
1031       emit(BFREV(this->result, op[0]));
1032       break;
1033    case ir_unop_bit_count:
1034       emit(CBIT(this->result, op[0]));
1035       break;
1036    case ir_unop_find_msb:
1037       temp = vgrf(glsl_type::uint_type);
1038       emit(FBH(temp, op[0]));
1039
1040       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1041        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1042        * subtract the result from 31 to convert the MSB count into an LSB count.
1043        */
1044
1045       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1046       emit(MOV(this->result, temp));
1047       emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));
1048
1049       temp.negate = true;
1050       inst = emit(ADD(this->result, temp, fs_reg(31)));
1051       inst->predicate = BRW_PREDICATE_NORMAL;
1052       break;
1053    case ir_unop_find_lsb:
1054       emit(FBL(this->result, op[0]));
1055       break;
1056    case ir_unop_saturate:
1057       inst = emit(MOV(this->result, op[0]));
1058       inst->saturate = true;
1059       break;
1060    case ir_triop_bitfield_extract:
1061       /* Note that the instruction's argument order is reversed from GLSL
1062        * and the IR.
1063        */
1064       emit(BFE(this->result, op[2], op[1], op[0]));
1065       break;
1066    case ir_binop_bfm:
1067       emit(BFI1(this->result, op[0], op[1]));
1068       break;
1069    case ir_triop_bfi:
1070       emit(BFI2(this->result, op[0], op[1], op[2]));
1071       break;
1072    case ir_quadop_bitfield_insert:
1073       unreachable("not reached: should be handled by "
1074               "lower_instructions::bitfield_insert_to_bfm_bfi");
1075
1076    case ir_unop_bit_not:
1077       emit(NOT(this->result, op[0]));
1078       break;
1079    case ir_binop_bit_and:
1080       emit(AND(this->result, op[0], op[1]));
1081       break;
1082    case ir_binop_bit_xor:
1083       emit(XOR(this->result, op[0], op[1]));
1084       break;
1085    case ir_binop_bit_or:
1086       emit(OR(this->result, op[0], op[1]));
1087       break;
1088
1089    case ir_binop_lshift:
1090       emit(SHL(this->result, op[0], op[1]));
1091       break;
1092
1093    case ir_binop_rshift:
1094       if (ir->type->base_type == GLSL_TYPE_INT)
1095          emit(ASR(this->result, op[0], op[1]));
1096       else
1097          emit(SHR(this->result, op[0], op[1]));
1098       break;
1099    case ir_binop_pack_half_2x16_split:
1100       emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
1101       break;
1102    case ir_binop_ubo_load: {
1103       /* This IR node takes a constant uniform block and a constant or
1104        * variable byte offset within the block and loads a vector from that.
1105        */
1106       ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1107       ir_constant *const_offset = ir->operands[1]->as_constant();
1108       fs_reg surf_index;
1109
1110       if (const_uniform_block) {
1111          /* The block index is a constant, so just emit the binding table entry
1112           * as an immediate.
1113           */
1114          surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
1115                                  const_uniform_block->value.u[0]);
1116       } else {
1117          /* The block index is not a constant. Evaluate the index expression
1118           * per-channel and add the base UBO index; the generator will select
1119           * a value from any live channel.
1120           */
1121          surf_index = vgrf(glsl_type::uint_type);
1122          emit(ADD(surf_index, op[0],
1123                   fs_reg(stage_prog_data->binding_table.ubo_start)))
1124             ->force_writemask_all = true;
1125
1126          /* Assume this may touch any UBO. It would be nice to provide
1127           * a tighter bound, but the array information is already lowered away.
1128           */
1129          brw_mark_surface_used(prog_data,
1130                                stage_prog_data->binding_table.ubo_start +
1131                                shader_prog->NumUniformBlocks - 1);
1132       }
1133
1134       if (const_offset) {
1135          fs_reg packed_consts = vgrf(glsl_type::float_type);
1136          packed_consts.type = result.type;
1137
1138          fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
1139          emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
1140                                    packed_consts, surf_index, const_offset_reg));
1141
1142          for (int i = 0; i < ir->type->vector_elements; i++) {
1143             packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i);
1144
1145             /* The std140 packing rules don't allow vectors to cross 16-byte
1146              * boundaries, and a reg is 32 bytes.
1147              */
1148             assert(packed_consts.subreg_offset < 32);
1149
1150             /* UBO bools are any nonzero value.  We consider bools to be
1151              * values with the low bit set to 1.  Convert them using CMP.
1152              */
1153             if (ir->type->base_type == GLSL_TYPE_BOOL) {
1154                emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
1155             } else {
1156                emit(MOV(result, packed_consts));
1157             }
1158
1159             result = offset(result, 1);
1160          }
1161       } else {
1162          /* Turn the byte offset into a dword offset. */
1163          fs_reg base_offset = vgrf(glsl_type::int_type);
1164          emit(SHR(base_offset, op[1], fs_reg(2)));
1165
1166          for (int i = 0; i < ir->type->vector_elements; i++) {
1167             emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
1168                                             base_offset, i));
1169
1170             if (ir->type->base_type == GLSL_TYPE_BOOL)
1171                emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
1172
1173             result = offset(result, 1);
1174          }
1175       }
1176
1177       result.reg_offset = 0;
1178       break;
1179    }
1180
1181    case ir_triop_fma:
1182       /* Note that the instruction's argument order is reversed from GLSL
1183        * and the IR.
1184        */
1185       emit(MAD(this->result, op[2], op[1], op[0]));
1186       break;
1187
1188    case ir_triop_lrp:
1189       emit_lrp(this->result, op[0], op[1], op[2]);
1190       break;
1191
1192    case ir_triop_csel:
1193    case ir_unop_interpolate_at_centroid:
1194    case ir_binop_interpolate_at_offset:
1195    case ir_binop_interpolate_at_sample:
1196       unreachable("already handled above");
1197       break;
1198
1199    case ir_unop_d2f:
1200    case ir_unop_f2d:
1201    case ir_unop_d2i:
1202    case ir_unop_i2d:
1203    case ir_unop_d2u:
1204    case ir_unop_u2d:
1205    case ir_unop_d2b:
1206    case ir_unop_pack_double_2x32:
1207    case ir_unop_unpack_double_2x32:
1208    case ir_unop_frexp_sig:
1209    case ir_unop_frexp_exp:
1210       unreachable("fp64 todo");
1211       break;
1212    }
1213 }
1214
     /**
      * Emit the MOVs that copy a value of GLSL type \p type from \p r into \p l.
      *
      * Float, uint, int and bool types are copied one component per MOV; arrays
      * and structs recurse into their element / field types.  Sampler, image and
      * atomic-counter types emit nothing and leave both registers untouched.
      *
      * \p l and \p r are references and are stepped forward with offset(reg, 1)
      * after each component visited, so nested calls pick up where the previous
      * one stopped and the caller's registers are left modified on return.
      *
      * \param l           destination register (advanced in place)
      * \param r           source register (advanced in place)
      * \param type        GLSL type describing what is being copied
      * \param predicated  if true, every MOV is emitted and predicated with
      *                    BRW_PREDICATE_NORMAL; if false, a MOV is skipped when
      *                    l.equals(r)
      */
1215 void
1216 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
1217                                    const glsl_type *type, bool predicated)
1218 {
1219    switch (type->base_type) {
1220    case GLSL_TYPE_FLOAT:
1221    case GLSL_TYPE_UINT:
1222    case GLSL_TYPE_INT:
1223    case GLSL_TYPE_BOOL:
1224       for (unsigned int i = 0; i < type->components(); i++) {
              /* Retype both sides to the register type that
               * brw_type_for_base_type() returns for this GLSL type.
               */
1225          l.type = brw_type_for_base_type(type);
1226          r.type = brw_type_for_base_type(type);
1227
              /* An unpredicated copy of a register onto itself is dropped. */
1228          if (predicated || !l.equals(r)) {
1229             fs_inst *inst = emit(MOV(l, r));
1230             inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
1231          }
1232
              /* Advance even when no MOV was emitted so later components and
               * later recursive calls stay aligned.
               */
1233          l = offset(l, 1);
1234          r = offset(r, 1);
1235       }
1236       break;
1237    case GLSL_TYPE_ARRAY:
           /* l and r are advanced by each recursive call, so successive
            * elements are copied from consecutive positions.
            */
1238       for (unsigned int i = 0; i < type->length; i++) {
1239          emit_assignment_writes(l, r, type->fields.array, predicated);
1240       }
1241       break;
1242
1243    case GLSL_TYPE_STRUCT:
           /* Same as arrays, but each field supplies its own type. */
1244       for (unsigned int i = 0; i < type->length; i++) {
1245          emit_assignment_writes(l, r, type->fields.structure[i].type,
1246                                 predicated);
1247       }
1248       break;
1249
1250    case GLSL_TYPE_SAMPLER:
1251    case GLSL_TYPE_IMAGE:
1252    case GLSL_TYPE_ATOMIC_UINT:
           /* Nothing is copied for these types. */
1253       break;
1254
1255    case GLSL_TYPE_VOID:
1256    case GLSL_TYPE_ERROR:
1257    case GLSL_TYPE_INTERFACE:
1258       unreachable("not reached");
1259    }
1260 }
1261
1262 /* If the RHS processing resulted in an instruction generating a
1263  * temporary value, and it would be easy to rewrite the instruction to
1264  * generate its result right into the LHS instead, do so.  This ends
1265  * up reliably removing instructions where it can be tricky to do so
1266  * later without real UD chain information.
      *
      * The rewrite is attempted only when all of the following hold:
      *  - the assignment has no condition;
      *  - the LHS is a scalar, or a vector whose write mask covers every
      *    component;
      *  - get_instruction_generating_reg() finds an instruction for src given
      *    pre_rhs_inst and last_rhs_inst;
      *  - that instruction's regs_written equals alloc.sizes[dst.reg].
      *
      * Returns true if the instruction's destination was replaced with dst (the
      * caller then has nothing left to copy), false if nothing was changed.
1267  */
1268 bool
1269 fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1270                                    fs_reg dst,
1271                                    fs_reg src,
1272                                    fs_inst *pre_rhs_inst,
1273                                    fs_inst *last_rhs_inst)
1274 {
1275    /* Only attempt if we're doing a direct assignment. */
1276    if (ir->condition ||
1277        !(ir->lhs->type->is_scalar() ||
1278         (ir->lhs->type->is_vector() &&
1279          ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
1280       return false;
1281
1282    /* Make sure the last instruction generated our source reg. */
1283    fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
1284                                                     last_rhs_inst,
1285                                                     src);
1286    if (!modify)
1287       return false;
1288
1289    /* If the generating instruction wrote a different number of registers
1290     * than the size recorded for dst's allocation, we can't safely rewrite it.
1291     */
1292    if (alloc.sizes[dst.reg] != modify->regs_written)
1293       return false;
1294
1295    /* Success!  Rewrite the instruction. */
1296    modify->dst = dst;
1297
1298    return true;
1299 }
1300
     /**
      * Generate code for a GLSL IR assignment.
      *
      * The LHS and then the RHS are visited.  The tail of the instruction list
      * is sampled before and after the RHS visit and both are handed to
      * try_rewrite_rhs_to_dst(); if that succeeds nothing more is emitted.
      * Otherwise the optional condition is turned into a condition code with
      * emit_bool_to_cond_code() and the value is copied with MOVs, which are
      * predicated when a condition is present.
      */
1301 void
1302 fs_visitor::visit(ir_assignment *ir)
1303 {
1304    fs_reg l, r;
1305    fs_inst *inst;
1306
1307    /* FINISHME: arrays on the lhs */
1308    ir->lhs->accept(this);
1309    l = this->result;
1310
        /* Remember where the instruction stream ended before the RHS. */
1311    fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
1312
1313    ir->rhs->accept(this);
1314    r = this->result;
1315
        /* ...and where it ends after the RHS has been emitted. */
1316    fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
1317
1318    assert(l.file != BAD_FILE);
1319    assert(r.file != BAD_FILE);
1320
        /* Fast path: retarget the RHS instruction at the LHS, no copy needed. */
1321    if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
1322       return;
1323
1324    if (ir->condition) {
1325       emit_bool_to_cond_code(ir->condition);
1326    }
1327
1328    if (ir->lhs->type->is_scalar() ||
1329        ir->lhs->type->is_vector()) {
           /* One MOV per channel enabled in the write mask.  l advances for
            * every channel, but r advances only after a channel is written,
            * so the RHS is consumed one component per enabled channel.
            */
1330       for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
1331          if (ir->write_mask & (1 << i)) {
1332             inst = emit(MOV(l, r));
1333             if (ir->condition)
1334                inst->predicate = BRW_PREDICATE_NORMAL;
1335             r = offset(r, 1);
1336          }
1337          l = offset(l, 1);
1338       }
1339    } else {
           /* Arrays, structs and other non-vector types: generic copy. */
1340       emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
1341    }
1342 }
1343
     /**
      * Build and emit a gen4 sampler message for texture operation \p op.
      *
      * The payload is assembled in MRFs starting at base_mrf (m1, the g0 header)
      * with one MOV per slot, then the sampler opcode is emitted with
      * header_present set.  The layouts used are:
      *
      *  - shadow compare (shadow_c valid): u, v, r, bias-or-lod, reference;
      *    only ir_tex, ir_txb and ir_txl are accepted here
      *  - ir_tex:  u, v, r
      *  - ir_txd:  coordinate, dPdx, dPdy (see the table in the body)
      *  - ir_txs, and ir_txb / ir_txl / ir_txf without shadow compare: sent as
      *    SIMD16 messages; the result lands in an 8-register temporary and
      *    every other register of it is then copied into \p dst
      *
      * \param op                texture opcode; one of ir_tex, ir_txb, ir_txl,
      *                          ir_txd, ir_txs or ir_txf
      * \param dst               where the four result registers end up
      * \param coordinate        first coordinate component
      * \param coord_components  number of coordinate components to copy
      * \param shadow_c          shadow reference value; a BAD_FILE register
      *                          means no shadow comparison
      * \param lod               bias (txb), LOD (txl / txf / txs) or dPdx (txd)
      * \param dPdy              Y gradient, read only for ir_txd
      * \param grad_components   number of components in each gradient
      * \param sampler           sampler index, wrapped in fs_reg(sampler) as
      *                          the instruction's last source
      * \return the sampler instruction (not the SIMD16 fix-up MOVs after it)
      */
1344 fs_inst *
1345 fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
1346                               fs_reg coordinate, int coord_components,
1347                               fs_reg shadow_c,
1348                               fs_reg lod, fs_reg dPdy, int grad_components,
1349                               uint32_t sampler)
1350 {
1351    int mlen;
1352    int base_mrf = 1;
1353    bool simd16 = false;
1354    fs_reg orig_dst;
1355
1356    /* g0 header. */
1357    mlen = 1;
1358
1359    if (shadow_c.file != BAD_FILE) {
1360       for (int i = 0; i < coord_components; i++) {
1361          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1362          coordinate = offset(coordinate, 1);
1363       }
1364
1365       /* gen4's SIMD8 sampler always has the slots for u,v,r present.
1366        * the unused slots must be zeroed.
1367        */
1368       for (int i = coord_components; i < 3; i++) {
1369          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
1370       }
1371       mlen += 3;
1372
1373       if (op == ir_tex) {
1374          /* There's no plain shadow compare message, so we use shadow
1375           * compare with a bias of 0.0.
1376           */
1377          emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
1378          mlen++;
1379       } else if (op == ir_txb || op == ir_txl) {
1380          emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
1381          mlen++;
1382       } else {
1383          unreachable("Should not get here.");
1384       }
1385
1386       emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
1387       mlen++;
1388    } else if (op == ir_tex) {
1389       for (int i = 0; i < coord_components; i++) {
1390          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1391          coordinate = offset(coordinate, 1);
1392       }
1393       /* zero the others. */
1394       for (int i = coord_components; i<3; i++) {
1395          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
1396       }
1397       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1398       mlen += 3;
1399    } else if (op == ir_txd) {
           /* For txd the lod parameter carries dPdx; alias it for clarity. */
1400       fs_reg &dPdx = lod;
1401
1402       for (int i = 0; i < coord_components; i++) {
1403          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1404          coordinate = offset(coordinate, 1);
1405       }
1406       /* the slots for u and v are always present, but r is optional */
1407       mlen += MAX2(coord_components, 2);
1408
1409       /*  P   = u, v, r
1410        * dPdx = dudx, dvdx, drdx
1411        * dPdy = dudy, dvdy, drdy
1412        *
1413        * 1-arg: Does not exist.
1414        *
1415        * 2-arg: dudx   dvdx   dudy   dvdy
1416        *        dPdx.x dPdx.y dPdy.x dPdy.y
1417        *        m4     m5     m6     m7
1418        *
1419        * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
1420        *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
1421        *        m5     m6     m7     m8     m9     m10
1422        */
           /* Each gradient component needs its own MRF (+ i) to match the
            * table above; without it every component would land in the
            * first gradient slot.
            */
1423       for (int i = 0; i < grad_components; i++) {
1424          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), dPdx));
1425          dPdx = offset(dPdx, 1);
1426       }
1427       mlen += MAX2(grad_components, 2);
1428
1429       for (int i = 0; i < grad_components; i++) {
1430          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), dPdy));
1431          dPdy = offset(dPdy, 1);
1432       }
1433       mlen += MAX2(grad_components, 2);
1434    } else if (op == ir_txs) {
1435       /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
1436       simd16 = true;
1437       emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
1438       mlen += 2;
1439    } else {
1440       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
1441        * instructions.  We'll need to do SIMD16 here.
1442        */
1443       simd16 = true;
1444       assert(op == ir_txb || op == ir_txl || op == ir_txf);
1445
           /* Each SIMD16 parameter takes two MRFs, hence the i * 2 stride. */
1446       for (int i = 0; i < coord_components; i++) {
1447          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
1448                   coordinate));
1449          coordinate = offset(coordinate, 1);
1450       }
1451
1452       /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
1453        * be necessary for TXF (ld), but seems wise to do for all messages.
1454        */
1455       for (int i = coord_components; i < 3; i++) {
1456          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
1457       }
1458
1459       /* lod/bias appears after u/v/r. */
1460       mlen += 6;
1461
1462       emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
1463       mlen++;
1464
1465       /* The unused upper half. */
1466       mlen++;
1467    }
1468
1469    if (simd16) {
1470       /* Now, since we're doing simd16, the return is 2 interleaved
1471        * vec4s where the odd-indexed ones are junk. We'll need to move
1472        * this weirdness around to the expected layout.
1473        */
1474       orig_dst = dst;
1475       dst = fs_reg(GRF, alloc.allocate(8), orig_dst.type);
1476    }
1477
1478    enum opcode opcode;
1479    switch (op) {
1480    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
1481    case ir_txb: opcode = FS_OPCODE_TXB; break;
1482    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
1483    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
1484    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
1485    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
1486    default:
1487       unreachable("not reached");
1488    }
1489
1490    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
1491    inst->base_mrf = base_mrf;
1492    inst->mlen = mlen;
1493    inst->header_present = true;
1494    inst->regs_written = simd16 ? 8 : 4;
1495
1496    if (simd16) {
           /* Keep registers 0, 2, 4 and 6 of the temporary; skip the rest. */
1497       for (int i = 0; i < 4; i++) {
1498          emit(MOV(orig_dst, dst));
1499          orig_dst = offset(orig_dst, 1);
1500          dst = offset(dst, 2);
1501       }
1502    }
1503
1504    return inst;
1505 }
1506
1507 /* gen5's sampler has slots for u, v, r, array index, then optional
1508  * parameters like shadow comparator or LOD bias.  If optional
1509  * parameters aren't present, those base slots are optional and don't
1510  * need to be included in the message.
1511  *
1512  * We don't fill in the unnecessary slots regardless, which may look
1513  * surprising in the disassembly.
      *
      * The payload is built in MRFs.  Coordinates always start at m2.  When
      * has_offset is true the message base is moved down to m1 and the
      * instruction is marked header_present; this function does not write m1
      * itself.  The message length is the distance from the message base to
      * msg_end, and fail() is called if it exceeds MAX_SAMPLER_MESSAGE_SIZE.
      *
      * Parameters:
      *  op               texture opcode selecting the payload layout and opcode
      *  dst              destination of the sampler instruction
      *  coordinate       first coordinate component
      *  vector_elements  number of coordinate components copied
      *  shadow_c         shadow reference; a BAD_FILE register means none
      *  lod              bias (txb), LOD (txl / txs / txf) or dPdx (txd)
      *  lod2             dPdy, read only for ir_txd
      *  grad_components  number of components in each gradient (ir_txd)
      *  sample_index     sample index, read only for ir_txf_ms
      *  sampler          sampler index, passed as fs_reg(sampler)
      *  has_offset       whether a header (m1) accompanies the message
      *
      * Returns the emitted sampler instruction.
1514  */
1515 fs_inst *
1516 fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
1517                               fs_reg coordinate, int vector_elements,
1518                               fs_reg shadow_c,
1519                               fs_reg lod, fs_reg lod2, int grad_components,
1520                               fs_reg sample_index, uint32_t sampler,
1521                               bool has_offset)
1522 {
1523    int reg_width = dispatch_width / 8;
1524    bool header_present = false;
1525
1526    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
        /* Copied before the header adjustment below, so coordinates stay at
         * m2 even when the message itself starts at m1.
         */
1527    fs_reg msg_coords = message;
1528
1529    if (has_offset) {
1530       /* The offsets set up by the ir_texture visitor are in the
1531        * m1 header, so we can't go headerless.
1532        */
1533       header_present = true;
1534       message.reg--;
1535    }
1536
1537    for (int i = 0; i < vector_elements; i++) {
1538       emit(MOV(retype(offset(msg_coords, i), coordinate.type), coordinate));
1539       coordinate = offset(coordinate, 1);
1540    }
        /* msg_end tracks one past the last slot written so far; msg_lod is
         * the first slot after the four base slots (u, v, r, array index).
         */
1541    fs_reg msg_end = offset(msg_coords, vector_elements);
1542    fs_reg msg_lod = offset(msg_coords, 4);
1543
1544    if (shadow_c.file != BAD_FILE) {
           /* The shadow reference takes the first optional slot and pushes
            * the lod/bias slot back by one.
            */
1545       fs_reg msg_shadow = msg_lod;
1546       emit(MOV(msg_shadow, shadow_c));
1547       msg_lod = offset(msg_shadow, 1);
1548       msg_end = msg_lod;
1549    }
1550
1551    enum opcode opcode;
1552    switch (op) {
1553    case ir_tex:
1554       opcode = SHADER_OPCODE_TEX;
1555       break;
1556    case ir_txb:
1557       emit(MOV(msg_lod, lod));
1558       msg_end = offset(msg_lod, 1);
1559
1560       opcode = FS_OPCODE_TXB;
1561       break;
1562    case ir_txl:
1563       emit(MOV(msg_lod, lod));
1564       msg_end = offset(msg_lod, 1);
1565
1566       opcode = SHADER_OPCODE_TXL;
1567       break;
1568    case ir_txd: {
1569       /**
1570        *  P   =  u,    v,    r
1571        * dPdx = dudx, dvdx, drdx
1572        * dPdy = dudy, dvdy, drdy
1573        *
1574        * Load up these values:
1575        * - dudx   dudy   dvdx   dvdy   drdx   drdy
1576        * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
1577        */
1578       msg_end = msg_lod;
1579       for (int i = 0; i < grad_components; i++) {
1580          emit(MOV(msg_end, lod));
1581          lod = offset(lod, 1);
1582          msg_end = offset(msg_end, 1);
1583
1584          emit(MOV(msg_end, lod2));
1585          lod2 = offset(lod2, 1);
1586          msg_end = offset(msg_end, 1);
1587       }
1588
1589       opcode = SHADER_OPCODE_TXD;
1590       break;
1591    }
1592    case ir_txs:
           /* The LOD goes right after whatever has been written so far. */
1593       msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
1594       emit(MOV(msg_lod, lod));
1595       msg_end = offset(msg_lod, 1);
1596
1597       opcode = SHADER_OPCODE_TXS;
1598       break;
1599    case ir_query_levels:
           /* Same message as ir_txs, with a constant 0 in the LOD slot. */
1600       msg_lod = msg_end;
1601       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1602       msg_end = offset(msg_lod, 1);
1603
1604       opcode = SHADER_OPCODE_TXS;
1605       break;
1606    case ir_txf:
           /* The LOD goes in the fourth base slot (index 3). */
1607       msg_lod = offset(msg_coords, 3);
1608       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod));
1609       msg_end = offset(msg_lod, 1);
1610
1611       opcode = SHADER_OPCODE_TXF;
1612       break;
1613    case ir_txf_ms:
1614       msg_lod = offset(msg_coords, 3);
1615       /* lod */
1616       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1617       /* sample index */
1618       emit(MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index));
1619       msg_end = offset(msg_lod, 2);
1620
1621       opcode = SHADER_OPCODE_TXF_CMS;
1622       break;
1623    case ir_lod:
1624       opcode = SHADER_OPCODE_LOD;
1625       break;
1626    case ir_tg4:
1627       opcode = SHADER_OPCODE_TG4;
1628       break;
1629    default:
1630       unreachable("not reached");
1631    }
1632
1633    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
1634    inst->base_mrf = message.reg;
1635    inst->mlen = msg_end.reg - message.reg;
1636    inst->header_present = header_present;
1637    inst->regs_written = 4 * reg_width;
1638
1639    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
1640       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
1641            " disallowed by hardware\n");
1642    }
1643
1644    return inst;
1645 }
1646
     /**
      * Decide whether \p sampler needs "high sampler" handling.
      *
      * Returns false on hardware that is neither gen8+ nor Haswell.  On the
      * remaining hardware it returns true when \p sampler is not an immediate,
      * or is an immediate whose value is 16 or more.
      *
      * emit_texture_gen7() treats a true result as one of the reasons to add a
      * message header.
      */
1647 static bool
1648 is_high_sampler(struct brw_context *brw, fs_reg sampler)
1649 {
1650    if (brw->gen < 8 && !brw->is_haswell)
1651       return false;
1652
        /* A non-immediate index is treated as high: its value is not known here. */
1653    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
1654 }
1655
1656 fs_inst *
1657 fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
1658                               fs_reg coordinate, int coord_components,
1659                               fs_reg shadow_c,
1660                               fs_reg lod, fs_reg lod2, int grad_components,
1661                               fs_reg sample_index, fs_reg mcs, fs_reg sampler,
1662                               fs_reg offset_value)
1663 {
1664    int reg_width = dispatch_width / 8;
1665    bool header_present = false;
1666
1667    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE);
1668    for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) {
1669       sources[i] = vgrf(glsl_type::float_type);
1670    }
1671    int length = 0;
1672
1673    if (op == ir_tg4 || offset_value.file != BAD_FILE ||
1674        is_high_sampler(brw, sampler)) {
1675       /* For general texture offsets (no txf workaround), we need a header to
1676        * put them in.  Note that for SIMD16 we're making space for two actual
1677        * hardware registers here, so the emit will have to fix up for this.
1678        *
1679        * * ir_tg4 needs to place its channel select in the header,
1680        * for interaction with ARB_texture_swizzle
1681        *
1682        * The sampler index is only 4-bits, so for larger sampler numbers we
1683        * need to offset the Sampler State Pointer in the header.
1684        */
1685       header_present = true;
1686       sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
1687       length++;
1688    }
1689
1690    if (shadow_c.file != BAD_FILE) {
1691       emit(MOV(sources[length], shadow_c));
1692       length++;
1693    }
1694
1695    bool has_nonconstant_offset =
1696       offset_value.file != BAD_FILE && offset_value.file != IMM;
1697    bool coordinate_done = false;
1698
1699    /* Set up the LOD info */
1700    switch (op) {
1701    case ir_tex:
1702    case ir_lod:
1703       break;
1704    case ir_txb:
1705       emit(MOV(sources[length], lod));
1706       length++;
1707       break;
1708    case ir_txl:
1709       emit(MOV(sources[length], lod));
1710       length++;
1711       break;
1712    case ir_txd: {
1713       no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
1714
1715       /* Load dPdx and the coordinate together:
1716        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
1717        */
1718       for (int i = 0; i < coord_components; i++) {
1719          emit(MOV(sources[length], coordinate));
1720          coordinate = offset(coordinate, 1);
1721          length++;
1722
1723          /* For cube map array, the coordinate is (u,v,r,ai) but there are
1724           * only derivatives for (u, v, r).
1725           */
1726          if (i < grad_components) {
1727             emit(MOV(sources[length], lod));
1728             lod = offset(lod, 1);
1729             length++;
1730
1731             emit(MOV(sources[length], lod2));
1732             lod2 = offset(lod2, 1);
1733             length++;
1734          }
1735       }
1736
1737       coordinate_done = true;
1738       break;
1739    }
1740    case ir_txs:
1741       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod));
1742       length++;
1743       break;
1744    case ir_query_levels:
1745       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1746       length++;
1747       break;
1748    case ir_txf:
1749       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
1750       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1751       coordinate = offset(coordinate, 1);
1752       length++;
1753
1754       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod));
1755       length++;
1756
1757       for (int i = 1; i < coord_components; i++) {
1758          emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1759          coordinate = offset(coordinate, 1);
1760          length++;
1761       }
1762
1763       coordinate_done = true;
1764       break;
1765    case ir_txf_ms:
1766       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index));
1767       length++;
1768
1769       /* data from the multisample control surface */
1770       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs));
1771       length++;
1772
1773       /* there is no offsetting for this message; just copy in the integer
1774        * texture coordinates
1775        */
1776       for (int i = 0; i < coord_components; i++) {
1777          emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1778          coordinate = offset(coordinate, 1);
1779          length++;
1780       }
1781
1782       coordinate_done = true;
1783       break;
1784    case ir_tg4:
1785       if (has_nonconstant_offset) {
1786          if (shadow_c.file != BAD_FILE)
1787             no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
1788
1789          /* More crazy intermixing */
1790          for (int i = 0; i < 2; i++) { /* u, v */
1791             emit(MOV(sources[length], coordinate));
1792             coordinate = offset(coordinate, 1);
1793             length++;
1794          }
1795
1796          for (int i = 0; i < 2; i++) { /* offu, offv */
1797             emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value));
1798             offset_value = offset(offset_value, 1);
1799             length++;
1800          }
1801
1802          if (coord_components == 3) { /* r if present */
1803             emit(MOV(sources[length], coordinate));
1804             coordinate = offset(coordinate, 1);
1805             length++;
1806          }
1807
1808          coordinate_done = true;
1809       }
1810       break;
1811    }
1812
1813    /* Set up the coordinate (except for cases where it was done above) */
1814    if (!coordinate_done) {
1815       for (int i = 0; i < coord_components; i++) {
1816          emit(MOV(sources[length], coordinate));
1817          coordinate = offset(coordinate, 1);
1818          length++;
1819       }
1820    }
1821
1822    int mlen;
1823    if (reg_width == 2)
1824       mlen = length * reg_width - header_present;
1825    else
1826       mlen = length * reg_width;
1827
1828    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
1829                                BRW_REGISTER_TYPE_F);
1830    emit(LOAD_PAYLOAD(src_payload, sources, length));
1831
1832    /* Generate the SEND */
1833    enum opcode opcode;
1834    switch (op) {
1835    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
1836    case ir_txb: opcode = FS_OPCODE_TXB; break;
1837    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
1838    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
1839    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
1840    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
1841    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
1842    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
1843    case ir_lod: opcode = SHADER_OPCODE_LOD; break;
1844    case ir_tg4:
1845       if (has_nonconstant_offset)
1846          opcode = SHADER_OPCODE_TG4_OFFSET;
1847       else
1848          opcode = SHADER_OPCODE_TG4;
1849       break;
1850    default:
1851       unreachable("not reached");
1852    }
1853    fs_inst *inst = emit(opcode, dst, src_payload, sampler);
1854    inst->base_mrf = -1;
1855    inst->mlen = mlen;
1856    inst->header_present = header_present;
1857    inst->regs_written = 4 * reg_width;
1858
1859    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
1860       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
1861            " disallowed by hardware\n");
1862    }
1863
1864    return inst;
1865 }
1866
1867 static struct brw_sampler_prog_key_data *
1868 get_tex(gl_shader_stage stage, const void *key)
1869 {
1870    switch (stage) {
1871    case MESA_SHADER_FRAGMENT:
1872       return &((brw_wm_prog_key*) key)->tex;
1873    case MESA_SHADER_VERTEX:
1874       return &((brw_vue_prog_key*) key)->tex;
1875    default:
1876       unreachable("unhandled shader stage");
1877    }
1878 }
1879
1880 fs_reg
1881 fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
1882                              bool is_rect, uint32_t sampler, int texunit)
1883 {
1884    fs_inst *inst = NULL;
1885    bool needs_gl_clamp = true;
1886    fs_reg scale_x, scale_y;
1887    struct brw_sampler_prog_key_data *tex = get_tex(stage, this->key);
1888
1889    /* The 965 requires the EU to do the normalization of GL rectangle
1890     * texture coordinates.  We use the program parameter state
1891     * tracking to get the scaling factor.
1892     */
1893    if (is_rect &&
1894        (brw->gen < 6 ||
1895         (brw->gen >= 6 && (tex->gl_clamp_mask[0] & (1 << sampler) ||
1896                            tex->gl_clamp_mask[1] & (1 << sampler))))) {
1897       struct gl_program_parameter_list *params = prog->Parameters;
1898       int tokens[STATE_LENGTH] = {
1899          STATE_INTERNAL,
1900          STATE_TEXRECT_SCALE,
1901          texunit,
1902          0,
1903          0
1904       };
1905
1906       no16("rectangle scale uniform setup not supported on SIMD16\n");
1907       if (dispatch_width == 16) {
1908          return coordinate;
1909       }
1910
1911       GLuint index = _mesa_add_state_reference(params,
1912                                                (gl_state_index *)tokens);
1913       /* Try to find existing copies of the texrect scale uniforms. */
1914       for (unsigned i = 0; i < uniforms; i++) {
1915          if (stage_prog_data->param[i] ==
1916              &prog->Parameters->ParameterValues[index][0]) {
1917             scale_x = fs_reg(UNIFORM, i);
1918             scale_y = fs_reg(UNIFORM, i + 1);
1919             break;
1920          }
1921       }
1922
1923       /* If we didn't already set them up, do so now. */
1924       if (scale_x.file == BAD_FILE) {
1925          scale_x = fs_reg(UNIFORM, uniforms);
1926          scale_y = fs_reg(UNIFORM, uniforms + 1);
1927
1928          stage_prog_data->param[uniforms++] =
1929             &prog->Parameters->ParameterValues[index][0];
1930          stage_prog_data->param[uniforms++] =
1931             &prog->Parameters->ParameterValues[index][1];
1932       }
1933    }
1934
1935    /* The 965 requires the EU to do the normalization of GL rectangle
1936     * texture coordinates.  We use the program parameter state
1937     * tracking to get the scaling factor.
1938     */
1939    if (brw->gen < 6 && is_rect) {
1940       fs_reg dst = fs_reg(GRF, alloc.allocate(coord_components));
1941       fs_reg src = coordinate;
1942       coordinate = dst;
1943
1944       emit(MUL(dst, src, scale_x));
1945       dst = offset(dst, 1);
1946       src = offset(src, 1);
1947       emit(MUL(dst, src, scale_y));
1948    } else if (is_rect) {
1949       /* On gen6+, the sampler handles the rectangle coordinates
1950        * natively, without needing rescaling.  But that means we have
1951        * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
1952        * not [0, 1] like the default case below.
1953        */
1954       needs_gl_clamp = false;
1955
1956       for (int i = 0; i < 2; i++) {
1957          if (tex->gl_clamp_mask[i] & (1 << sampler)) {
1958             fs_reg chan = coordinate;
1959             chan = offset(chan, i);
1960
1961             inst = emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f));
1962             inst->conditional_mod = BRW_CONDITIONAL_GE;
1963
1964             /* Our parameter comes in as 1.0/width or 1.0/height,
1965              * because that's what people normally want for doing
1966              * texture rectangle handling.  We need width or height
1967              * for clamping, but we don't care enough to make a new
1968              * parameter type, so just invert back.
1969              */
1970             fs_reg limit = vgrf(glsl_type::float_type);
1971             emit(MOV(limit, i == 0 ? scale_x : scale_y));
1972             emit(SHADER_OPCODE_RCP, limit, limit);
1973
1974             inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
1975             inst->conditional_mod = BRW_CONDITIONAL_L;
1976          }
1977       }
1978    }
1979
1980    if (coord_components > 0 && needs_gl_clamp) {
1981       for (int i = 0; i < MIN2(coord_components, 3); i++) {
1982          if (tex->gl_clamp_mask[i] & (1 << sampler)) {
1983             fs_reg chan = coordinate;
1984             chan = offset(chan, i);
1985
1986             fs_inst *inst = emit(MOV(chan, chan));
1987             inst->saturate = true;
1988          }
1989       }
1990    }
1991    return coordinate;
1992 }
1993
1994 /* Sample from the MCS surface attached to this multisample texture. */
1995 fs_reg
1996 fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
1997 {
1998    int reg_width = dispatch_width / 8;
1999    fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
2000                            BRW_REGISTER_TYPE_F);
2001    fs_reg dest = vgrf(glsl_type::uvec4_type);
2002    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
2003
2004    /* parameters are: u, v, r; missing parameters are treated as zero */
2005    for (int i = 0; i < components; i++) {
2006       sources[i] = vgrf(glsl_type::float_type);
2007       emit(MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate));
2008       coordinate = offset(coordinate, 1);
2009    }
2010
2011    emit(LOAD_PAYLOAD(payload, sources, components));
2012
2013    fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
2014    inst->base_mrf = -1;
2015    inst->mlen = components * reg_width;
2016    inst->header_present = false;
2017    inst->regs_written = 4 * reg_width; /* we only care about one reg of
2018                                         * response, but the sampler always
2019                                         * writes 4/8
2020                                         */
2021
2022    return dest;
2023 }
2024
2025 void
2026 fs_visitor::emit_texture(ir_texture_opcode op,
2027                          const glsl_type *dest_type,
2028                          fs_reg coordinate, int coord_components,
2029                          fs_reg shadow_c,
2030                          fs_reg lod, fs_reg lod2, int grad_components,
2031                          fs_reg sample_index,
2032                          fs_reg offset_value, unsigned offset_components,
2033                          fs_reg mcs,
2034                          int gather_component,
2035                          bool is_cube_array,
2036                          bool is_rect,
2037                          uint32_t sampler,
2038                          fs_reg sampler_reg, int texunit)
2039 {
2040    struct brw_sampler_prog_key_data *tex = get_tex(stage, this->key);
2041    fs_inst *inst = NULL;
2042
2043    if (op == ir_tg4) {
2044       /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2045        * emitting anything other than setting up the constant result.
2046        */
2047       int swiz = GET_SWZ(tex->swizzles[sampler], gather_component);
2048       if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2049
2050          fs_reg res = vgrf(glsl_type::vec4_type);
2051          this->result = res;
2052
2053          for (int i=0; i<4; i++) {
2054             emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
2055             res = offset(res, 1);
2056          }
2057          return;
2058       }
2059    }
2060
2061    if (coordinate.file != BAD_FILE) {
2062       /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
2063        * samplers.  This should only be a problem with GL_CLAMP on Gen7.
2064        */
2065       coordinate = rescale_texcoord(coordinate, coord_components, is_rect,
2066                                     sampler, texunit);
2067    }
2068
2069    /* Writemasking doesn't eliminate channels on SIMD8 texture
2070     * samples, so don't worry about them.
2071     */
2072    fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
2073
2074    if (brw->gen >= 7) {
2075       inst = emit_texture_gen7(op, dst, coordinate, coord_components,
2076                                shadow_c, lod, lod2, grad_components,
2077                                sample_index, mcs, sampler_reg,
2078                                offset_value);
2079    } else if (brw->gen >= 5) {
2080       inst = emit_texture_gen5(op, dst, coordinate, coord_components,
2081                                shadow_c, lod, lod2, grad_components,
2082                                sample_index, sampler,
2083                                offset_value.file != BAD_FILE);
2084    } else {
2085       inst = emit_texture_gen4(op, dst, coordinate, coord_components,
2086                                shadow_c, lod, lod2, grad_components,
2087                                sampler);
2088    }
2089
2090    if (shadow_c.file != BAD_FILE)
2091       inst->shadow_compare = true;
2092
2093    if (offset_value.file == IMM)
2094       inst->offset = offset_value.fixed_hw_reg.dw1.ud;
2095
2096    if (op == ir_tg4) {
2097       inst->offset |=
2098          gather_channel(gather_component, sampler) << 16; /* M0.2:16-17 */
2099
2100       if (brw->gen == 6)
2101          emit_gen6_gather_wa(tex->gen6_gather_wa[sampler], dst);
2102    }
2103
2104    /* fixup #layers for cube map arrays */
2105    if (op == ir_txs && is_cube_array) {
2106       fs_reg depth = offset(dst, 2);
2107       fs_reg fixed_depth = vgrf(glsl_type::int_type);
2108       emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
2109
2110       fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
2111       int components = inst->regs_written / (dst.width / 8);
2112       for (int i = 0; i < components; i++) {
2113          if (i == 2) {
2114             fixed_payload[i] = fixed_depth;
2115          } else {
2116             fixed_payload[i] = offset(dst, i);
2117          }
2118       }
2119       emit(LOAD_PAYLOAD(dst, fixed_payload, components));
2120    }
2121
2122    swizzle_result(op, dest_type->vector_elements, dst, sampler);
2123 }
2124
2125 void
2126 fs_visitor::visit(ir_texture *ir)
2127 {
2128    const struct brw_sampler_prog_key_data *tex = get_tex(stage, this->key);
2129    uint32_t sampler =
2130       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2131
2132    ir_rvalue *nonconst_sampler_index =
2133       _mesa_get_sampler_array_nonconst_index(ir->sampler);
2134
2135    /* Handle non-constant sampler array indexing */
2136    fs_reg sampler_reg;
2137    if (nonconst_sampler_index) {
2138       /* The highest sampler which may be used by this operation is
2139        * the last element of the array. Mark it here, because the generator
2140        * doesn't have enough information to determine the bound.
2141        */
2142       uint32_t array_size = ir->sampler->as_dereference_array()
2143          ->array->type->array_size();
2144
2145       uint32_t max_used = sampler + array_size - 1;
2146       if (ir->op == ir_tg4 && brw->gen < 8) {
2147          max_used += stage_prog_data->binding_table.gather_texture_start;
2148       } else {
2149          max_used += stage_prog_data->binding_table.texture_start;
2150       }
2151
2152       brw_mark_surface_used(prog_data, max_used);
2153
2154       /* Emit code to evaluate the actual indexing expression */
2155       nonconst_sampler_index->accept(this);
2156       fs_reg temp = vgrf(glsl_type::uint_type);
2157       emit(ADD(temp, this->result, fs_reg(sampler)))
2158             ->force_writemask_all = true;
2159       sampler_reg = temp;
2160    } else {
2161       /* Single sampler, or constant array index; the indexing expression
2162        * is just an immediate.
2163        */
2164       sampler_reg = fs_reg(sampler);
2165    }
2166
2167    /* FINISHME: We're failing to recompile our programs when the sampler is
2168     * updated.  This only matters for the texture rectangle scale parameters
2169     * (pre-gen6, or gen6+ with GL_CLAMP).
2170     */
2171    int texunit = prog->SamplerUnits[sampler];
2172
2173    /* Should be lowered by do_lower_texture_projection */
2174    assert(!ir->projector);
2175
2176    /* Should be lowered */
2177    assert(!ir->offset || !ir->offset->type->is_array());
2178
2179    /* Generate code to compute all the subexpression trees.  This has to be
2180     * done before loading any values into MRFs for the sampler message since
2181     * generating these values may involve SEND messages that need the MRFs.
2182     */
2183    fs_reg coordinate;
2184    int coord_components = 0;
2185    if (ir->coordinate) {
2186       coord_components = ir->coordinate->type->vector_elements;
2187       ir->coordinate->accept(this);
2188       coordinate = this->result;
2189    }
2190
2191    fs_reg shadow_comparitor;
2192    if (ir->shadow_comparitor) {
2193       ir->shadow_comparitor->accept(this);
2194       shadow_comparitor = this->result;
2195    }
2196
2197    fs_reg offset_value;
2198    int offset_components = 0;
2199    if (ir->offset) {
2200       ir_constant *const_offset = ir->offset->as_constant();
2201       if (const_offset) {
2202          /* Store the header bitfield in an IMM register.  This allows us to
2203           * use offset_value.file to distinguish between no offset, a constant
2204           * offset, and a non-constant offset.
2205           */
2206          offset_value =
2207             fs_reg(brw_texture_offset(ctx, const_offset->value.i,
2208                                       const_offset->type->vector_elements));
2209       } else {
2210          ir->offset->accept(this);
2211          offset_value = this->result;
2212       }
2213       offset_components = ir->offset->type->vector_elements;
2214    }
2215
2216    fs_reg lod, lod2, sample_index, mcs;
2217    int grad_components = 0;
2218    switch (ir->op) {
2219    case ir_tex:
2220    case ir_lod:
2221    case ir_tg4:
2222    case ir_query_levels:
2223       break;
2224    case ir_txb:
2225       ir->lod_info.bias->accept(this);
2226       lod = this->result;
2227       break;
2228    case ir_txd:
2229       ir->lod_info.grad.dPdx->accept(this);
2230       lod = this->result;
2231
2232       ir->lod_info.grad.dPdy->accept(this);
2233       lod2 = this->result;
2234
2235       grad_components = ir->lod_info.grad.dPdx->type->vector_elements;
2236       break;
2237    case ir_txf:
2238    case ir_txl:
2239    case ir_txs:
2240       ir->lod_info.lod->accept(this);
2241       lod = this->result;
2242       break;
2243    case ir_txf_ms:
2244       ir->lod_info.sample_index->accept(this);
2245       sample_index = this->result;
2246
2247       if (brw->gen >= 7 && tex->compressed_multisample_layout_mask & (1<<sampler))
2248          mcs = emit_mcs_fetch(coordinate, ir->coordinate->type->vector_elements,
2249                               sampler_reg);
2250       else
2251          mcs = fs_reg(0u);
2252       break;
2253    default:
2254       unreachable("Unrecognized texture opcode");
2255    };
2256
2257    int gather_component = 0;
2258    if (ir->op == ir_tg4)
2259       gather_component = ir->lod_info.component->as_constant()->value.i[0];
2260
2261    bool is_rect =
2262       ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT;
2263
2264    bool is_cube_array =
2265       ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2266       ir->sampler->type->sampler_array;
2267
2268    emit_texture(ir->op, ir->type, coordinate, coord_components,
2269                 shadow_comparitor, lod, lod2, grad_components,
2270                 sample_index, offset_value, offset_components, mcs,
2271                 gather_component, is_cube_array, is_rect, sampler,
2272                 sampler_reg, texunit);
2273 }
2274
2275 /**
2276  * Apply workarounds for Gen6 gather with UINT/SINT
2277  */
2278 void
2279 fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
2280 {
2281    if (!wa)
2282       return;
2283
2284    int width = (wa & WA_8BIT) ? 8 : 16;
2285
2286    for (int i = 0; i < 4; i++) {
2287       fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
2288       /* Convert from UNORM to UINT */
2289       emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1))));
2290       emit(MOV(dst, dst_f));
2291
2292       if (wa & WA_SIGN) {
2293          /* Reinterpret the UINT value as a signed INT value by
2294           * shifting the sign bit into place, then shifting back
2295           * preserving sign.
2296           */
2297          emit(SHL(dst, dst, fs_reg(32 - width)));
2298          emit(ASR(dst, dst, fs_reg(32 - width)));
2299       }
2300
2301       dst = offset(dst, 1);
2302    }
2303 }
2304
2305 /**
2306  * Set up the gather channel based on the swizzle, for gather4.
2307  */
2308 uint32_t
2309 fs_visitor::gather_channel(int orig_chan, uint32_t sampler)
2310 {
2311    struct brw_sampler_prog_key_data *tex = get_tex(stage, this->key);
2312    int swiz = GET_SWZ(tex->swizzles[sampler], orig_chan);
2313    switch (swiz) {
2314       case SWIZZLE_X: return 0;
2315       case SWIZZLE_Y:
2316          /* gather4 sampler is broken for green channel on RG32F --
2317           * we must ask for blue instead.
2318           */
2319          if (tex->gather_channel_quirk_mask & (1<<sampler))
2320             return 2;
2321          return 1;
2322       case SWIZZLE_Z: return 2;
2323       case SWIZZLE_W: return 3;
2324       default:
2325          unreachable("Not reached"); /* zero, one swizzles handled already */
2326    }
2327 }
2328
2329 /**
2330  * Swizzle the result of a texture result.  This is necessary for
2331  * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
2332  */
2333 void
2334 fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
2335                            fs_reg orig_val, uint32_t sampler)
2336 {
2337    if (op == ir_query_levels) {
2338       /* # levels is in .w */
2339       this->result = offset(orig_val, 3);
2340       return;
2341    }
2342
2343    this->result = orig_val;
2344
2345    /* txs,lod don't actually sample the texture, so swizzling the result
2346     * makes no sense.
2347     */
2348    if (op == ir_txs || op == ir_lod || op == ir_tg4)
2349       return;
2350
2351    struct brw_sampler_prog_key_data *tex = get_tex(stage, this->key);
2352
2353    if (dest_components == 1) {
2354       /* Ignore DEPTH_TEXTURE_MODE swizzling. */
2355    } else if (tex->swizzles[sampler] != SWIZZLE_NOOP) {
2356       fs_reg swizzled_result = vgrf(glsl_type::vec4_type);
2357       swizzled_result.type = orig_val.type;
2358
2359       for (int i = 0; i < 4; i++) {
2360          int swiz = GET_SWZ(tex->swizzles[sampler], i);
2361          fs_reg l = swizzled_result;
2362          l = offset(l, i);
2363
2364          if (swiz == SWIZZLE_ZERO) {
2365             emit(MOV(l, fs_reg(0.0f)));
2366          } else if (swiz == SWIZZLE_ONE) {
2367             emit(MOV(l, fs_reg(1.0f)));
2368          } else {
2369             emit(MOV(l, offset(orig_val,
2370                                GET_SWZ(tex->swizzles[sampler], i))));
2371          }
2372       }
2373       this->result = swizzled_result;
2374    }
2375 }
2376
2377 void
2378 fs_visitor::visit(ir_swizzle *ir)
2379 {
2380    ir->val->accept(this);
2381    fs_reg val = this->result;
2382
2383    if (ir->type->vector_elements == 1) {
2384       this->result = offset(this->result, ir->mask.x);
2385       return;
2386    }
2387
2388    fs_reg result = vgrf(ir->type);
2389    this->result = result;
2390
2391    for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
2392       fs_reg channel = val;
2393       int swiz = 0;
2394
2395       switch (i) {
2396       case 0:
2397          swiz = ir->mask.x;
2398          break;
2399       case 1:
2400          swiz = ir->mask.y;
2401          break;
2402       case 2:
2403          swiz = ir->mask.z;
2404          break;
2405       case 3:
2406          swiz = ir->mask.w;
2407          break;
2408       }
2409
2410       emit(MOV(result, offset(channel, swiz)));
2411       result = offset(result, 1);
2412    }
2413 }
2414
2415 void
2416 fs_visitor::visit(ir_discard *ir)
2417 {
2418    assert(ir->condition == NULL); /* FINISHME */
2419
2420    /* We track our discarded pixels in f0.1.  By predicating on it, we can
2421     * update just the flag bits that aren't yet discarded.  By emitting a
2422     * CMP of g0 != g0, all our currently executing channels will get turned
2423     * off.
2424     */
2425    fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
2426                                    BRW_REGISTER_TYPE_UW));
2427    fs_inst *cmp = emit(CMP(reg_null_f, some_reg, some_reg,
2428                            BRW_CONDITIONAL_NZ));
2429    cmp->predicate = BRW_PREDICATE_NORMAL;
2430    cmp->flag_subreg = 1;
2431
2432    if (brw->gen >= 6) {
2433       /* For performance, after a discard, jump to the end of the shader.
2434        * Only jump if all relevant channels have been discarded.
2435        */
2436       fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
2437       discard_jump->flag_subreg = 1;
2438
2439       discard_jump->predicate = (dispatch_width == 8)
2440                                 ? BRW_PREDICATE_ALIGN1_ANY8H
2441                                 : BRW_PREDICATE_ALIGN1_ANY16H;
2442       discard_jump->predicate_inverse = true;
2443    }
2444 }
2445
2446 void
2447 fs_visitor::visit(ir_constant *ir)
2448 {
2449    /* Set this->result to reg at the bottom of the function because some code
2450     * paths will cause this visitor to be applied to other fields.  This will
2451     * cause the value stored in this->result to be modified.
2452     *
2453     * Make reg constant so that it doesn't get accidentally modified along the
2454     * way.  Yes, I actually had this problem. :(
2455     */
2456    const fs_reg reg = vgrf(ir->type);
2457    fs_reg dst_reg = reg;
2458
2459    if (ir->type->is_array()) {
2460       const unsigned size = type_size(ir->type->fields.array);
2461
2462       for (unsigned i = 0; i < ir->type->length; i++) {
2463          ir->array_elements[i]->accept(this);
2464          fs_reg src_reg = this->result;
2465
2466          dst_reg.type = src_reg.type;
2467          for (unsigned j = 0; j < size; j++) {
2468             emit(MOV(dst_reg, src_reg));
2469             src_reg = offset(src_reg, 1);
2470             dst_reg = offset(dst_reg, 1);
2471          }
2472       }
2473    } else if (ir->type->is_record()) {
2474       foreach_in_list(ir_constant, field, &ir->components) {
2475          const unsigned size = type_size(field->type);
2476
2477          field->accept(this);
2478          fs_reg src_reg = this->result;
2479
2480          dst_reg.type = src_reg.type;
2481          for (unsigned j = 0; j < size; j++) {
2482             emit(MOV(dst_reg, src_reg));
2483             src_reg = offset(src_reg, 1);
2484             dst_reg = offset(dst_reg, 1);
2485          }
2486       }
2487    } else {
2488       const unsigned size = type_size(ir->type);
2489
2490       for (unsigned i = 0; i < size; i++) {
2491          switch (ir->type->base_type) {
2492          case GLSL_TYPE_FLOAT:
2493             emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
2494             break;
2495          case GLSL_TYPE_UINT:
2496             emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
2497             break;
2498          case GLSL_TYPE_INT:
2499             emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
2500             break;
2501          case GLSL_TYPE_BOOL:
2502             emit(MOV(dst_reg,
2503                      fs_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2504                                                 : 0)));
2505             break;
2506          default:
2507             unreachable("Non-float/uint/int/bool constant");
2508          }
2509          dst_reg = offset(dst_reg, 1);
2510       }
2511    }
2512
2513    this->result = reg;
2514 }
2515
2516 void
2517 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
2518 {
2519    ir_expression *expr = ir->as_expression();
2520
2521    if (!expr || expr->operation == ir_binop_ubo_load) {
2522       ir->accept(this);
2523
2524       fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
2525       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2526       return;
2527    }
2528
2529    fs_reg op[3];
2530    fs_inst *inst;
2531
2532    assert(expr->get_num_operands() <= 3);
2533    for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
2534       assert(expr->operands[i]->type->is_scalar());
2535
2536       expr->operands[i]->accept(this);
2537       op[i] = this->result;
2538
2539       resolve_ud_negate(&op[i]);
2540    }
2541
2542    switch (expr->operation) {
2543    case ir_unop_logic_not:
2544       inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
2545       inst->conditional_mod = BRW_CONDITIONAL_Z;
2546       break;
2547
2548    case ir_binop_logic_xor:
2549       if (brw->gen <= 5) {
2550          fs_reg temp = vgrf(ir->type);
2551          emit(XOR(temp, op[0], op[1]));
2552          inst = emit(AND(reg_null_d, temp, fs_reg(1)));
2553       } else {
2554          inst = emit(XOR(reg_null_d, op[0], op[1]));
2555       }
2556       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2557       break;
2558
2559    case ir_binop_logic_or:
2560       if (brw->gen <= 5) {
2561          fs_reg temp = vgrf(ir->type);
2562          emit(OR(temp, op[0], op[1]));
2563          inst = emit(AND(reg_null_d, temp, fs_reg(1)));
2564       } else {
2565          inst = emit(OR(reg_null_d, op[0], op[1]));
2566       }
2567       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2568       break;
2569
2570    case ir_binop_logic_and:
2571       if (brw->gen <= 5) {
2572          fs_reg temp = vgrf(ir->type);
2573          emit(AND(temp, op[0], op[1]));
2574          inst = emit(AND(reg_null_d, temp, fs_reg(1)));
2575       } else {
2576          inst = emit(AND(reg_null_d, op[0], op[1]));
2577       }
2578       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2579       break;
2580
2581    case ir_unop_f2b:
2582       if (brw->gen >= 6) {
2583          emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
2584       } else {
2585          inst = emit(MOV(reg_null_f, op[0]));
2586          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2587       }
2588       break;
2589
2590    case ir_unop_i2b:
2591       if (brw->gen >= 6) {
2592          emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
2593       } else {
2594          inst = emit(MOV(reg_null_d, op[0]));
2595          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2596       }
2597       break;
2598
2599    case ir_binop_greater:
2600    case ir_binop_gequal:
2601    case ir_binop_less:
2602    case ir_binop_lequal:
2603    case ir_binop_equal:
2604    case ir_binop_all_equal:
2605    case ir_binop_nequal:
2606    case ir_binop_any_nequal:
2607       if (brw->gen <= 5) {
2608          resolve_bool_comparison(expr->operands[0], &op[0]);
2609          resolve_bool_comparison(expr->operands[1], &op[1]);
2610       }
2611
2612       emit(CMP(reg_null_d, op[0], op[1],
2613                brw_conditional_for_comparison(expr->operation)));
2614       break;
2615
2616    case ir_triop_csel: {
2617       /* Expand the boolean condition into the flag register. */
2618       inst = emit(MOV(reg_null_d, op[0]));
2619       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2620
2621       /* Select which boolean to return. */
2622       fs_reg temp = vgrf(expr->operands[1]->type);
2623       inst = emit(SEL(temp, op[1], op[2]));
2624       inst->predicate = BRW_PREDICATE_NORMAL;
2625
2626       /* Expand the result to a condition code. */
2627       inst = emit(MOV(reg_null_d, temp));
2628       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2629       break;
2630    }
2631
2632    default:
2633       unreachable("not reached");
2634    }
2635 }
2636
2637 /**
2638  * Emit a gen6 IF statement with the comparison folded into the IF
2639  * instruction.
2640  */
2641 void
2642 fs_visitor::emit_if_gen6(ir_if *ir)
2643 {
2644    ir_expression *expr = ir->condition->as_expression();
2645
2646    if (expr && expr->operation != ir_binop_ubo_load) {
2647       fs_reg op[3];
2648       fs_inst *inst;
2649       fs_reg temp;
2650
2651       assert(expr->get_num_operands() <= 3);
2652       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
2653          assert(expr->operands[i]->type->is_scalar());
2654
2655          expr->operands[i]->accept(this);
2656          op[i] = this->result;
2657       }
2658
2659       switch (expr->operation) {
2660       case ir_unop_logic_not:
2661          emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_Z));
2662          return;
2663
2664       case ir_binop_logic_xor:
2665          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
2666          return;
2667
2668       case ir_binop_logic_or:
2669          temp = vgrf(glsl_type::bool_type);
2670          emit(OR(temp, op[0], op[1]));
2671          emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
2672          return;
2673
2674       case ir_binop_logic_and:
2675          temp = vgrf(glsl_type::bool_type);
2676          emit(AND(temp, op[0], op[1]));
2677          emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
2678          return;
2679
2680       case ir_unop_f2b:
2681          inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
2682          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2683          return;
2684
2685       case ir_unop_i2b:
2686          emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
2687          return;
2688
2689       case ir_binop_greater:
2690       case ir_binop_gequal:
2691       case ir_binop_less:
2692       case ir_binop_lequal:
2693       case ir_binop_equal:
2694       case ir_binop_all_equal:
2695       case ir_binop_nequal:
2696       case ir_binop_any_nequal:
2697          if (brw->gen <= 5) {
2698             resolve_bool_comparison(expr->operands[0], &op[0]);
2699             resolve_bool_comparison(expr->operands[1], &op[1]);
2700          }
2701
2702          emit(IF(op[0], op[1],
2703                  brw_conditional_for_comparison(expr->operation)));
2704          return;
2705
2706       case ir_triop_csel: {
2707          /* Expand the boolean condition into the flag register. */
2708          fs_inst *inst = emit(MOV(reg_null_d, op[0]));
2709          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2710
2711          /* Select which boolean to use as the result. */
2712          fs_reg temp = vgrf(expr->operands[1]->type);
2713          inst = emit(SEL(temp, op[1], op[2]));
2714          inst->predicate = BRW_PREDICATE_NORMAL;
2715
2716          emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
2717          return;
2718       }
2719
2720       default:
2721          unreachable("not reached");
2722       }
2723    }
2724
2725    ir->condition->accept(this);
2726    emit(IF(this->result, fs_reg(0), BRW_CONDITIONAL_NZ));
2727 }
2728
2729 bool
2730 fs_visitor::try_opt_frontfacing_ternary(ir_if *ir)
2731 {
2732    ir_dereference_variable *deref = ir->condition->as_dereference_variable();
2733    if (!deref || strcmp(deref->var->name, "gl_FrontFacing") != 0)
2734       return false;
2735
2736    if (ir->then_instructions.length() != 1 ||
2737        ir->else_instructions.length() != 1)
2738       return false;
2739
2740    ir_assignment *then_assign =
2741          ((ir_instruction *)ir->then_instructions.head)->as_assignment();
2742    ir_assignment *else_assign =
2743          ((ir_instruction *)ir->else_instructions.head)->as_assignment();
2744
2745    if (!then_assign || then_assign->condition ||
2746        !else_assign || else_assign->condition ||
2747        then_assign->write_mask != else_assign->write_mask ||
2748        !then_assign->lhs->equals(else_assign->lhs))
2749       return false;
2750
2751    ir_constant *then_rhs = then_assign->rhs->as_constant();
2752    ir_constant *else_rhs = else_assign->rhs->as_constant();
2753
2754    if (!then_rhs || !else_rhs)
2755       return false;
2756
2757    if ((then_rhs->is_one() || then_rhs->is_negative_one()) &&
2758        (else_rhs->is_one() || else_rhs->is_negative_one())) {
2759       assert(then_rhs->is_one() == else_rhs->is_negative_one());
2760       assert(else_rhs->is_one() == then_rhs->is_negative_one());
2761
2762       then_assign->lhs->accept(this);
2763       fs_reg dst = this->result;
2764       dst.type = BRW_REGISTER_TYPE_D;
2765       fs_reg tmp = vgrf(glsl_type::int_type);
2766
2767       if (brw->gen >= 6) {
2768          /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
2769          fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
2770
2771          /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
2772           *
2773           *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
2774           *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
2775           *
2776           * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
2777           */
2778
2779          if (then_rhs->is_negative_one()) {
2780             assert(else_rhs->is_one());
2781             g0.negate = true;
2782          }
2783
2784          tmp.type = BRW_REGISTER_TYPE_W;
2785          tmp.subreg_offset = 2;
2786          tmp.stride = 2;
2787
2788          fs_inst *or_inst = emit(OR(tmp, g0, fs_reg(0x3f80)));
2789          or_inst->src[1].type = BRW_REGISTER_TYPE_UW;
2790
2791          tmp.type = BRW_REGISTER_TYPE_D;
2792          tmp.subreg_offset = 0;
2793          tmp.stride = 1;
2794       } else {
2795          /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
2796          fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
2797
2798          /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
2799           *
2800           *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
2801           *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
2802           *
2803           * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
2804           */
2805
2806          if (then_rhs->is_negative_one()) {
2807             assert(else_rhs->is_one());
2808             g1_6.negate = true;
2809          }
2810
2811          emit(OR(tmp, g1_6, fs_reg(0x3f800000)));
2812       }
2813       emit(AND(dst, tmp, fs_reg(0xbf800000)));
2814       return true;
2815    }
2816
2817    return false;
2818 }
2819
2820 /**
2821  * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
2822  *
2823  * Many GLSL shaders contain the following pattern:
2824  *
2825  *    x = condition ? foo : bar
2826  *
2827  * The compiler emits an ir_if tree for this, since each subexpression might be
2828  * a complex tree that could have side-effects or short-circuit logic.
2829  *
2830  * However, the common case is to simply select one of two constants or
2831  * variable values---which is exactly what SEL is for.  In this case, the
2832  * assembly looks like:
2833  *
2834  *    (+f0) IF
2835  *    MOV dst src0
2836  *    ELSE
2837  *    MOV dst src1
2838  *    ENDIF
2839  *
2840  * which can be easily translated into:
2841  *
2842  *    (+f0) SEL dst src0 src1
2843  *
2844  * If src0 is an immediate value, we promote it to a temporary GRF.
2845  */
2846 bool
2847 fs_visitor::try_replace_with_sel()
2848 {
2849    fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
2850    assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
2851
2852    /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
2853    int opcodes[] = {
2854       BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
2855    };
2856
2857    fs_inst *match = (fs_inst *) endif_inst->prev;
2858    for (int i = 0; i < 4; i++) {
2859       if (match->is_head_sentinel() || match->opcode != opcodes[4-i-1])
2860          return false;
2861       match = (fs_inst *) match->prev;
2862    }
2863
2864    /* The opcodes match; it looks like the right sequence of instructions. */
2865    fs_inst *else_mov = (fs_inst *) endif_inst->prev;
2866    fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
2867    fs_inst *if_inst = (fs_inst *) then_mov->prev;
2868
2869    /* Check that the MOVs are the right form. */
2870    if (then_mov->dst.equals(else_mov->dst) &&
2871        !then_mov->is_partial_write() &&
2872        !else_mov->is_partial_write()) {
2873
2874       /* Remove the matched instructions; we'll emit a SEL to replace them. */
2875       while (!if_inst->next->is_tail_sentinel())
2876          if_inst->next->exec_node::remove();
2877       if_inst->exec_node::remove();
2878
2879       /* Only the last source register can be a constant, so if the MOV in
2880        * the "then" clause uses a constant, we need to put it in a temporary.
2881        */
2882       fs_reg src0(then_mov->src[0]);
2883       if (src0.file == IMM) {
2884          src0 = vgrf(glsl_type::float_type);
2885          src0.type = then_mov->src[0].type;
2886          emit(MOV(src0, then_mov->src[0]));
2887       }
2888
2889       fs_inst *sel;
2890       if (if_inst->conditional_mod) {
2891          /* Sandybridge-specific IF with embedded comparison */
2892          emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
2893                   if_inst->conditional_mod));
2894          sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
2895          sel->predicate = BRW_PREDICATE_NORMAL;
2896       } else {
2897          /* Separate CMP and IF instructions */
2898          sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
2899          sel->predicate = if_inst->predicate;
2900          sel->predicate_inverse = if_inst->predicate_inverse;
2901       }
2902
2903       return true;
2904    }
2905
2906    return false;
2907 }
2908
2909 void
2910 fs_visitor::visit(ir_if *ir)
2911 {
2912    if (try_opt_frontfacing_ternary(ir))
2913       return;
2914
2915    /* Don't point the annotation at the if statement, because then it plus
2916     * the then and else blocks get printed.
2917     */
2918    this->base_ir = ir->condition;
2919
2920    if (brw->gen == 6) {
2921       emit_if_gen6(ir);
2922    } else {
2923       emit_bool_to_cond_code(ir->condition);
2924
2925       emit(IF(BRW_PREDICATE_NORMAL));
2926    }
2927
2928    foreach_in_list(ir_instruction, ir_, &ir->then_instructions) {
2929       this->base_ir = ir_;
2930       ir_->accept(this);
2931    }
2932
2933    if (!ir->else_instructions.is_empty()) {
2934       emit(BRW_OPCODE_ELSE);
2935
2936       foreach_in_list(ir_instruction, ir_, &ir->else_instructions) {
2937          this->base_ir = ir_;
2938          ir_->accept(this);
2939       }
2940    }
2941
2942    emit(BRW_OPCODE_ENDIF);
2943
2944    if (!try_replace_with_sel() && brw->gen < 6) {
2945       no16("Can't support (non-uniform) control flow on SIMD16\n");
2946    }
2947 }
2948
2949 void
2950 fs_visitor::visit(ir_loop *ir)
2951 {
2952    if (brw->gen < 6) {
2953       no16("Can't support (non-uniform) control flow on SIMD16\n");
2954    }
2955
2956    this->base_ir = NULL;
2957    emit(BRW_OPCODE_DO);
2958
2959    foreach_in_list(ir_instruction, ir_, &ir->body_instructions) {
2960       this->base_ir = ir_;
2961       ir_->accept(this);
2962    }
2963
2964    this->base_ir = NULL;
2965    emit(BRW_OPCODE_WHILE);
2966 }
2967
2968 void
2969 fs_visitor::visit(ir_loop_jump *ir)
2970 {
2971    switch (ir->mode) {
2972    case ir_loop_jump::jump_break:
2973       emit(BRW_OPCODE_BREAK);
2974       break;
2975    case ir_loop_jump::jump_continue:
2976       emit(BRW_OPCODE_CONTINUE);
2977       break;
2978    }
2979 }
2980
2981 void
2982 fs_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2983 {
2984    ir_dereference *deref = static_cast<ir_dereference *>(
2985       ir->actual_parameters.get_head());
2986    ir_variable *location = deref->variable_referenced();
2987    unsigned surf_index = (stage_prog_data->binding_table.abo_start +
2988                           location->data.binding);
2989
2990    /* Calculate the surface offset */
2991    fs_reg offset = vgrf(glsl_type::uint_type);
2992    ir_dereference_array *deref_array = deref->as_dereference_array();
2993
2994    if (deref_array) {
2995       deref_array->array_index->accept(this);
2996
2997       fs_reg tmp = vgrf(glsl_type::uint_type);
2998       emit(MUL(tmp, this->result, fs_reg(ATOMIC_COUNTER_SIZE)));
2999       emit(ADD(offset, tmp, fs_reg(location->data.atomic.offset)));
3000    } else {
3001       offset = fs_reg(location->data.atomic.offset);
3002    }
3003
3004    /* Emit the appropriate machine instruction */
3005    const char *callee = ir->callee->function_name();
3006    ir->return_deref->accept(this);
3007    fs_reg dst = this->result;
3008
3009    if (!strcmp("__intrinsic_atomic_read", callee)) {
3010       emit_untyped_surface_read(surf_index, dst, offset);
3011
3012    } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
3013       emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
3014                           fs_reg(), fs_reg());
3015
3016    } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
3017       emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
3018                           fs_reg(), fs_reg());
3019    }
3020 }
3021
3022 void
3023 fs_visitor::visit(ir_call *ir)
3024 {
3025    const char *callee = ir->callee->function_name();
3026
3027    if (!strcmp("__intrinsic_atomic_read", callee) ||
3028        !strcmp("__intrinsic_atomic_increment", callee) ||
3029        !strcmp("__intrinsic_atomic_predecrement", callee)) {
3030       visit_atomic_counter_intrinsic(ir);
3031    } else {
3032       unreachable("Unsupported intrinsic.");
3033    }
3034 }
3035
3036 void
3037 fs_visitor::visit(ir_return *)
3038 {
3039    unreachable("FINISHME");
3040 }
3041
3042 void
3043 fs_visitor::visit(ir_function *ir)
3044 {
3045    /* Ignore function bodies other than main() -- we shouldn't see calls to
3046     * them since they should all be inlined before we get to ir_to_mesa.
3047     */
3048    if (strcmp(ir->name, "main") == 0) {
3049       const ir_function_signature *sig;
3050       exec_list empty;
3051
3052       sig = ir->matching_signature(NULL, &empty, false);
3053
3054       assert(sig);
3055
3056       foreach_in_list(ir_instruction, ir_, &sig->body) {
3057          this->base_ir = ir_;
3058          ir_->accept(this);
3059       }
3060    }
3061 }
3062
3063 void
3064 fs_visitor::visit(ir_function_signature *)
3065 {
3066    unreachable("not reached");
3067 }
3068
3069 void
3070 fs_visitor::visit(ir_emit_vertex *)
3071 {
3072    unreachable("not reached");
3073 }
3074
3075 void
3076 fs_visitor::visit(ir_end_primitive *)
3077 {
3078    unreachable("not reached");
3079 }
3080
3081 void
3082 fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3083                                 fs_reg dst, fs_reg offset, fs_reg src0,
3084                                 fs_reg src1)
3085 {
3086    int reg_width = dispatch_width / 8;
3087    int length = 0;
3088
3089    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4);
3090
3091    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
3092    /* Initialize the sample mask in the message header. */
3093    emit(MOV(sources[0], fs_reg(0u)))
3094       ->force_writemask_all = true;
3095
3096    if (stage == MESA_SHADER_FRAGMENT) {
3097       if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
3098          emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
3099             ->force_writemask_all = true;
3100       } else {
3101          emit(MOV(component(sources[0], 7),
3102                   retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
3103             ->force_writemask_all = true;
3104       }
3105    } else {
3106       /* The execution mask is part of the side-band information sent together with
3107        * the message payload to the data port. It's implicitly ANDed with the sample
3108        * mask sent in the header to compute the actual set of channels that execute
3109        * the atomic operation.
3110        */
3111       assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
3112       emit(MOV(component(sources[0], 7),
3113                fs_reg(0xffff)))->force_writemask_all = true;
3114    }
3115    length++;
3116
3117    /* Set the atomic operation offset. */
3118    sources[1] = vgrf(glsl_type::uint_type);
3119    emit(MOV(sources[1], offset));
3120    length++;
3121
3122    /* Set the atomic operation arguments. */
3123    if (src0.file != BAD_FILE) {
3124       sources[length] = vgrf(glsl_type::uint_type);
3125       emit(MOV(sources[length], src0));
3126       length++;
3127    }
3128
3129    if (src1.file != BAD_FILE) {
3130       sources[length] = vgrf(glsl_type::uint_type);
3131       emit(MOV(sources[length], src1));
3132       length++;
3133    }
3134
3135    int mlen = 1 + (length - 1) * reg_width;
3136    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
3137                                BRW_REGISTER_TYPE_UD);
3138    emit(LOAD_PAYLOAD(src_payload, sources, length));
3139
3140    /* Emit the instruction. */
3141    fs_inst *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
3142                         fs_reg(atomic_op), fs_reg(surf_index));
3143    inst->mlen = mlen;
3144 }
3145
3146 void
3147 fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
3148                                       fs_reg offset)
3149 {
3150    int reg_width = dispatch_width / 8;
3151
3152    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
3153
3154    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
3155    /* Initialize the sample mask in the message header. */
3156    emit(MOV(sources[0], fs_reg(0u)))
3157       ->force_writemask_all = true;
3158
3159    if (stage == MESA_SHADER_FRAGMENT) {
3160       if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
3161          emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
3162             ->force_writemask_all = true;
3163       } else {
3164          emit(MOV(component(sources[0], 7),
3165                   retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
3166             ->force_writemask_all = true;
3167       }
3168    } else {
3169       /* The execution mask is part of the side-band information sent together with
3170        * the message payload to the data port. It's implicitly ANDed with the sample
3171        * mask sent in the header to compute the actual set of channels that execute
3172        * the atomic operation.
3173        */
3174       assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
3175       emit(MOV(component(sources[0], 7),
3176                fs_reg(0xffff)))->force_writemask_all = true;
3177    }
3178
3179    /* Set the surface read offset. */
3180    sources[1] = vgrf(glsl_type::uint_type);
3181    emit(MOV(sources[1], offset));
3182
3183    int mlen = 1 + reg_width;
3184    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
3185                                BRW_REGISTER_TYPE_UD);
3186    fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2));
3187
3188    /* Emit the instruction. */
3189    inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
3190                fs_reg(surf_index));
3191    inst->mlen = mlen;
3192 }
3193
3194 fs_inst *
3195 fs_visitor::emit(fs_inst *inst)
3196 {
3197    if (dispatch_width == 16 && inst->exec_size == 8)
3198       inst->force_uncompressed = true;
3199
3200    inst->annotation = this->current_annotation;
3201    inst->ir = this->base_ir;
3202
3203    this->instructions.push_tail(inst);
3204
3205    return inst;
3206 }
3207
3208 void
3209 fs_visitor::emit(exec_list list)
3210 {
3211    foreach_in_list_safe(fs_inst, inst, &list) {
3212       inst->exec_node::remove();
3213       emit(inst);
3214    }
3215 }
3216
3217 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
3218 void
3219 fs_visitor::emit_dummy_fs()
3220 {
3221    int reg_width = dispatch_width / 8;
3222
3223    /* Everyone's favorite color. */
3224    const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
3225    for (int i = 0; i < 4; i++) {
3226       emit(MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
3227                       dispatch_width), fs_reg(color[i])));
3228    }
3229
3230    fs_inst *write;
3231    write = emit(FS_OPCODE_FB_WRITE);
3232    write->eot = true;
3233    if (brw->gen >= 6) {
3234       write->base_mrf = 2;
3235       write->mlen = 4 * reg_width;
3236    } else {
3237       write->header_present = true;
3238       write->base_mrf = 0;
3239       write->mlen = 2 + 4 * reg_width;
3240    }
3241
3242    /* Tell the SF we don't have any inputs.  Gen4-5 require at least one
3243     * varying to avoid GPU hangs, so set that.
3244     */
3245    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3246    wm_prog_data->num_varying_inputs = brw->gen < 6 ? 1 : 0;
3247    memset(wm_prog_data->urb_setup, -1,
3248           sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
3249
3250    /* We don't have any uniforms. */
3251    stage_prog_data->nr_params = 0;
3252    stage_prog_data->nr_pull_params = 0;
3253    stage_prog_data->curb_read_length = 0;
3254    stage_prog_data->dispatch_grf_start_reg = 2;
3255    wm_prog_data->dispatch_grf_start_reg_16 = 2;
3256    grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
3257
3258    calculate_cfg();
3259 }
3260
3261 /* The register location here is relative to the start of the URB
3262  * data.  It will get adjusted to be a real location before
3263  * generate_code() time.
3264  */
3265 struct brw_reg
3266 fs_visitor::interp_reg(int location, int channel)
3267 {
3268    assert(stage == MESA_SHADER_FRAGMENT);
3269    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3270    int regnr = prog_data->urb_setup[location] * 2 + channel / 2;
3271    int stride = (channel & 1) * 4;
3272
3273    assert(prog_data->urb_setup[location] != -1);
3274
3275    return brw_vec1_grf(regnr, stride);
3276 }
3277
3278 /** Emits the interpolation for the varying inputs. */
3279 void
3280 fs_visitor::emit_interpolation_setup_gen4()
3281 {
3282    this->current_annotation = "compute pixel centers";
3283    this->pixel_x = vgrf(glsl_type::uint_type);
3284    this->pixel_y = vgrf(glsl_type::uint_type);
3285    this->pixel_x.type = BRW_REGISTER_TYPE_UW;
3286    this->pixel_y.type = BRW_REGISTER_TYPE_UW;
3287
3288    emit(FS_OPCODE_PIXEL_X, this->pixel_x);
3289    emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
3290
3291    this->current_annotation = "compute pixel deltas from v0";
3292    if (brw->has_pln) {
3293       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
3294          vgrf(glsl_type::vec2_type);
3295       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
3296          offset(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 1);
3297    } else {
3298       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
3299          vgrf(glsl_type::float_type);
3300       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
3301          vgrf(glsl_type::float_type);
3302    }
3303    emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
3304             this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))));
3305    emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
3306             this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))));
3307
3308    this->current_annotation = "compute pos.w and 1/pos.w";
3309    /* Compute wpos.w.  It's always in our setup, since it's needed to
3310     * interpolate the other attributes.
3311     */
3312    this->wpos_w = vgrf(glsl_type::float_type);
3313    emit(FS_OPCODE_LINTERP, wpos_w,
3314         this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
3315         this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
3316         interp_reg(VARYING_SLOT_POS, 3));
3317    /* Compute the pixel 1/W value from wpos.w. */
3318    this->pixel_w = vgrf(glsl_type::float_type);
3319    emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
3320    this->current_annotation = NULL;
3321 }
3322
3323 /** Emits the interpolation for the varying inputs. */
3324 void
3325 fs_visitor::emit_interpolation_setup_gen6()
3326 {
3327    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
3328
3329    /* If the pixel centers end up used, the setup is the same as for gen4. */
3330    this->current_annotation = "compute pixel centers";
3331    fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
3332    fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
3333    int_pixel_x.type = BRW_REGISTER_TYPE_UW;
3334    int_pixel_y.type = BRW_REGISTER_TYPE_UW;
3335    emit(ADD(int_pixel_x,
3336             fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
3337             fs_reg(brw_imm_v(0x10101010))));
3338    emit(ADD(int_pixel_y,
3339             fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
3340             fs_reg(brw_imm_v(0x11001100))));
3341
3342    /* As of gen6, we can no longer mix float and int sources.  We have
3343     * to turn the integer pixel centers into floats for their actual
3344     * use.
3345     */
3346    this->pixel_x = vgrf(glsl_type::float_type);
3347    this->pixel_y = vgrf(glsl_type::float_type);
3348    emit(MOV(this->pixel_x, int_pixel_x));
3349    emit(MOV(this->pixel_y, int_pixel_y));
3350
3351    this->current_annotation = "compute pos.w";
3352    this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
3353    this->wpos_w = vgrf(glsl_type::float_type);
3354    emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
3355
3356    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3357       uint8_t reg = payload.barycentric_coord_reg[i];
3358       this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
3359       this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
3360    }
3361
3362    this->current_annotation = NULL;
3363 }
3364
3365 int
3366 fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components)
3367 {
3368    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3369    fs_inst *inst;
3370
3371    if (color.file == BAD_FILE) {
3372       return 4 * (dispatch_width / 8);
3373    }
3374
3375    uint8_t colors_enabled;
3376    if (components == 0) {
3377       /* We want to write one component to the alpha channel */
3378       colors_enabled = 0x8;
3379    } else {
3380       /* Enable the first components-many channels */
3381       colors_enabled = (1 << components) - 1;
3382    }
3383
3384    if (dispatch_width == 8 || brw->gen >= 6) {
3385       /* SIMD8 write looks like:
3386        * m + 0: r0
3387        * m + 1: r1
3388        * m + 2: g0
3389        * m + 3: g1
3390        *
3391        * gen6 SIMD16 DP write looks like:
3392        * m + 0: r0
3393        * m + 1: r1
3394        * m + 2: g0
3395        * m + 3: g1
3396        * m + 4: b0
3397        * m + 5: b1
3398        * m + 6: a0
3399        * m + 7: a1
3400        */
3401       int len = 0;
3402       for (unsigned i = 0; i < 4; ++i) {
3403          if (colors_enabled & (1 << i)) {
3404             dst[len] = fs_reg(GRF, alloc.allocate(color.width / 8),
3405                               color.type, color.width);
3406             inst = emit(MOV(dst[len], offset(color, i)));
3407             inst->saturate = key->clamp_fragment_color;
3408          } else if (color.width == 16) {
3409             /* We need two BAD_FILE slots for a 16-wide color */
3410             len++;
3411          }
3412          len++;
3413       }
3414       return len;
3415    } else {
3416       /* pre-gen6 SIMD16 single source DP write looks like:
3417        * m + 0: r0
3418        * m + 1: g0
3419        * m + 2: b0
3420        * m + 3: a0
3421        * m + 4: r1
3422        * m + 5: g1
3423        * m + 6: b1
3424        * m + 7: a1
3425        */
3426       for (unsigned i = 0; i < 4; ++i) {
3427          if (colors_enabled & (1 << i)) {
3428             dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
3429             inst = emit(MOV(dst[i], half(offset(color, i), 0)));
3430             inst->saturate = key->clamp_fragment_color;
3431
3432             dst[i + 4] = fs_reg(GRF, alloc.allocate(1), color.type);
3433             inst = emit(MOV(dst[i + 4], half(offset(color, i), 1)));
3434             inst->saturate = key->clamp_fragment_color;
3435             inst->force_sechalf = true;
3436          }
3437       }
3438       return 8;
3439    }
3440 }
3441
3442 static enum brw_conditional_mod
3443 cond_for_alpha_func(GLenum func)
3444 {
3445    switch(func) {
3446       case GL_GREATER:
3447          return BRW_CONDITIONAL_G;
3448       case GL_GEQUAL:
3449          return BRW_CONDITIONAL_GE;
3450       case GL_LESS:
3451          return BRW_CONDITIONAL_L;
3452       case GL_LEQUAL:
3453          return BRW_CONDITIONAL_LE;
3454       case GL_EQUAL:
3455          return BRW_CONDITIONAL_EQ;
3456       case GL_NOTEQUAL:
3457          return BRW_CONDITIONAL_NEQ;
3458       default:
3459          unreachable("Not reached");
3460    }
3461 }
3462
3463 /**
3464  * Alpha test support for when we compile it into the shader instead
3465  * of using the normal fixed-function alpha test.
3466  */
3467 void
3468 fs_visitor::emit_alpha_test()
3469 {
3470    assert(stage == MESA_SHADER_FRAGMENT);
3471    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3472    this->current_annotation = "Alpha test";
3473
3474    fs_inst *cmp;
3475    if (key->alpha_test_func == GL_ALWAYS)
3476       return;
3477
3478    if (key->alpha_test_func == GL_NEVER) {
3479       /* f0.1 = 0 */
3480       fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3481                                       BRW_REGISTER_TYPE_UW));
3482       cmp = emit(CMP(reg_null_f, some_reg, some_reg,
3483                      BRW_CONDITIONAL_NEQ));
3484    } else {
3485       /* RT0 alpha */
3486       fs_reg color = offset(outputs[0], 3);
3487
3488       /* f0.1 &= func(color, ref) */
3489       cmp = emit(CMP(reg_null_f, color, fs_reg(key->alpha_test_ref),
3490                      cond_for_alpha_func(key->alpha_test_func)));
3491    }
3492    cmp->predicate = BRW_PREDICATE_NORMAL;
3493    cmp->flag_subreg = 1;
3494 }
3495
3496 fs_inst *
3497 fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
3498                                  fs_reg src0_alpha, unsigned components)
3499 {
3500    assert(stage == MESA_SHADER_FRAGMENT);
3501    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3502    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3503
3504    this->current_annotation = "FB write header";
3505    bool header_present = true;
3506    int reg_size = dispatch_width / 8;
3507
3508    /* We can potentially have a message length of up to 15, so we have to set
3509     * base_mrf to either 0 or 1 in order to fit in m0..m15.
3510     */
3511    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15);
3512    int length = 0;
3513
3514    /* From the Sandy Bridge PRM, volume 4, page 198:
3515     *
3516     *     "Dispatched Pixel Enables. One bit per pixel indicating
3517     *      which pixels were originally enabled when the thread was
3518     *      dispatched. This field is only required for the end-of-
3519     *      thread message and on all dual-source messages."
3520     */
3521    if (brw->gen >= 6 &&
3522        (brw->is_haswell || brw->gen >= 8 || !prog_data->uses_kill) &&
3523        color1.file == BAD_FILE &&
3524        key->nr_color_regions == 1) {
3525       header_present = false;
3526    }
3527
3528    if (header_present)
3529       /* Allocate 2 registers for a header */
3530       length += 2;
3531
3532    if (payload.aa_dest_stencil_reg) {
3533       sources[length] = fs_reg(GRF, alloc.allocate(1));
3534       emit(MOV(sources[length],
3535                fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))));
3536       length++;
3537    }
3538
3539    prog_data->uses_omask =
3540       prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
3541    if (prog_data->uses_omask) {
3542       this->current_annotation = "FB write oMask";
3543       assert(this->sample_mask.file != BAD_FILE);
3544       /* Hand over gl_SampleMask. Only lower 16 bits are relevant.  Since
3545        * it's unsinged single words, one vgrf is always 16-wide.
3546        */
3547       sources[length] = fs_reg(GRF, alloc.allocate(1),
3548                                BRW_REGISTER_TYPE_UW, 16);
3549       emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
3550       length++;
3551    }
3552
3553    if (color0.file == BAD_FILE) {
3554       /* Even if there's no color buffers enabled, we still need to send
3555        * alpha out the pipeline to our null renderbuffer to support
3556        * alpha-testing, alpha-to-coverage, and so on.
3557        */
3558       length += setup_color_payload(sources + length, this->outputs[0], 0);
3559    } else if (color1.file == BAD_FILE) {
3560       if (src0_alpha.file != BAD_FILE) {
3561          sources[length] = fs_reg(GRF, alloc.allocate(reg_size),
3562                                   src0_alpha.type, src0_alpha.width);
3563          fs_inst *inst = emit(MOV(sources[length], src0_alpha));
3564          inst->saturate = key->clamp_fragment_color;
3565          length++;
3566       }
3567
3568       length += setup_color_payload(sources + length, color0, components);
3569    } else {
3570       length += setup_color_payload(sources + length, color0, components);
3571       length += setup_color_payload(sources + length, color1, components);
3572    }
3573
3574    if (source_depth_to_render_target) {
3575       if (brw->gen == 6) {
3576          /* For outputting oDepth on gen6, SIMD8 writes have to be
3577           * used.  This would require SIMD8 moves of each half to
3578           * message regs, kind of like pre-gen5 SIMD16 FB writes.
3579           * Just bail on doing so for now.
3580           */
3581          no16("Missing support for simd16 depth writes on gen6\n");
3582       }
3583
3584       sources[length] = vgrf(glsl_type::float_type);
3585       if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3586          /* Hand over gl_FragDepth. */
3587          assert(this->frag_depth.file != BAD_FILE);
3588          emit(MOV(sources[length], this->frag_depth));
3589       } else {
3590          /* Pass through the payload depth. */
3591          emit(MOV(sources[length],
3592                   fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
3593       }
3594       length++;
3595    }
3596
3597    if (payload.dest_depth_reg) {
3598       sources[length] = vgrf(glsl_type::float_type);
3599       emit(MOV(sources[length],
3600                fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0))));
3601       length++;
3602    }
3603
3604    fs_inst *load;
3605    fs_inst *write;
3606    if (brw->gen >= 7) {
3607       /* Send from the GRF */
3608       fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
3609       load = emit(LOAD_PAYLOAD(payload, sources, length));
3610       payload.reg = alloc.allocate(load->regs_written);
3611       payload.width = dispatch_width;
3612       load->dst = payload;
3613       write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
3614       write->base_mrf = -1;
3615    } else {
3616       /* Send from the MRF */
3617       load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
3618                                sources, length));
3619       write = emit(FS_OPCODE_FB_WRITE);
3620       write->exec_size = dispatch_width;
3621       write->base_mrf = 1;
3622    }
3623
3624    write->mlen = load->regs_written;
3625    write->header_present = header_present;
3626    if (prog_data->uses_kill) {
3627       write->predicate = BRW_PREDICATE_NORMAL;
3628       write->flag_subreg = 1;
3629    }
3630    return write;
3631 }
3632
3633 void
3634 fs_visitor::emit_fb_writes()
3635 {
3636    assert(stage == MESA_SHADER_FRAGMENT);
3637    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3638    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3639
3640    if (do_dual_src) {
3641       no16("GL_ARB_blend_func_extended not yet supported in SIMD16.");
3642       if (dispatch_width == 16)
3643          do_dual_src = false;
3644    }
3645
3646    fs_inst *inst;
3647    if (do_dual_src) {
3648       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3649          emit_shader_time_end();
3650
3651       this->current_annotation = ralloc_asprintf(this->mem_ctx,
3652                                                  "FB dual-source write");
3653       inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
3654                                   reg_undef, 4);
3655       inst->target = 0;
3656       prog_data->dual_src_blend = true;
3657    } else if (key->nr_color_regions > 0) {
3658       for (int target = 0; target < key->nr_color_regions; target++) {
3659          this->current_annotation = ralloc_asprintf(this->mem_ctx,
3660                                                     "FB write target %d",
3661                                                     target);
3662          fs_reg src0_alpha;
3663          if (brw->gen >= 6 && key->replicate_alpha && target != 0)
3664             src0_alpha = offset(outputs[0], 3);
3665
3666          if (target == key->nr_color_regions - 1 &&
3667              (INTEL_DEBUG & DEBUG_SHADER_TIME))
3668             emit_shader_time_end();
3669
3670          inst = emit_single_fb_write(this->outputs[target], reg_undef,
3671                                      src0_alpha,
3672                                      this->output_components[target]);
3673          inst->target = target;
3674       }
3675    } else {
3676       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3677          emit_shader_time_end();
3678
3679       /* Even if there's no color buffers enabled, we still need to send
3680        * alpha out the pipeline to our null renderbuffer to support
3681        * alpha-testing, alpha-to-coverage, and so on.
3682        */
3683       inst = emit_single_fb_write(reg_undef, reg_undef, reg_undef, 0);
3684       inst->target = 0;
3685    }
3686
3687    inst->eot = true;
3688    this->current_annotation = NULL;
3689 }
3690
3691 void
3692 fs_visitor::setup_uniform_clipplane_values()
3693 {
3694    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
3695    const struct brw_vue_prog_key *key =
3696       (const struct brw_vue_prog_key *) this->key;
3697
3698    for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
3699       this->userplane[i] = fs_reg(UNIFORM, uniforms);
3700       for (int j = 0; j < 4; ++j) {
3701          stage_prog_data->param[uniforms + j] =
3702             (gl_constant_value *) &clip_planes[i][j];
3703       }
3704       uniforms += 4;
3705    }
3706 }
3707
3708 void fs_visitor::compute_clip_distance()
3709 {
3710    struct brw_vue_prog_data *vue_prog_data =
3711       (struct brw_vue_prog_data *) prog_data;
3712    const struct brw_vue_prog_key *key =
3713       (const struct brw_vue_prog_key *) this->key;
3714
3715    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3716     *
3717     *     "If a linked set of shaders forming the vertex stage contains no
3718     *     static write to gl_ClipVertex or gl_ClipDistance, but the
3719     *     application has requested clipping against user clip planes through
3720     *     the API, then the coordinate written to gl_Position is used for
3721     *     comparison against the user clip planes."
3722     *
3723     * This function is only called if the shader didn't write to
3724     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
3725     * if the user wrote to it; otherwise we use gl_Position.
3726     */
3727
3728    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3729    if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
3730       clip_vertex = VARYING_SLOT_POS;
3731
3732    /* If the clip vertex isn't written, skip this.  Typically this means
3733     * the GS will set up clipping. */
3734    if (outputs[clip_vertex].file == BAD_FILE)
3735       return;
3736
3737    setup_uniform_clipplane_values();
3738
3739    current_annotation = "user clip distances";
3740
3741    this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
3742    this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
3743
3744    for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
3745       fs_reg u = userplane[i];
3746       fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
3747       output.reg_offset = i & 3;
3748
3749       emit(MUL(output, outputs[clip_vertex], u));
3750       for (int j = 1; j < 4; j++) {
3751          u.reg = userplane[i].reg + j;
3752          emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
3753       }
3754    }
3755 }
3756
3757 void
3758 fs_visitor::emit_urb_writes()
3759 {
3760    int slot, urb_offset, length;
3761    struct brw_vs_prog_data *vs_prog_data =
3762       (struct brw_vs_prog_data *) prog_data;
3763    const struct brw_vs_prog_key *key =
3764       (const struct brw_vs_prog_key *) this->key;
3765    const GLbitfield64 psiz_mask =
3766       VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
3767    const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
3768    bool flush;
3769    fs_reg sources[8];
3770
3771    /* Lower legacy ff and ClipVertex clipping to clip distances */
3772    if (key->base.userclip_active && !prog->UsesClipDistanceOut)
3773       compute_clip_distance();
3774
3775    /* If we don't have any valid slots to write, just do a minimal urb write
3776     * send to terminate the shader. */
3777    if (vue_map->slots_valid == 0) {
3778
3779       fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
3780       fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
3781                                                       BRW_REGISTER_TYPE_UD))));
3782       inst->force_writemask_all = true;
3783
3784       inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
3785       inst->eot = true;
3786       inst->mlen = 1;
3787       inst->offset = 1;
3788       return;
3789    }
3790
3791    length = 0;
3792    urb_offset = 0;
3793    flush = false;
3794    for (slot = 0; slot < vue_map->num_slots; slot++) {
3795       fs_reg reg, src, zero;
3796
3797       int varying = vue_map->slot_to_varying[slot];
3798       switch (varying) {
3799       case VARYING_SLOT_PSIZ:
3800
3801          /* The point size varying slot is the vue header and is always in the
3802           * vue map.  But often none of the special varyings that live there
3803           * are written and in that case we can skip writing to the vue
3804           * header, provided the corresponding state properly clamps the
3805           * values further down the pipeline. */
3806          if ((vue_map->slots_valid & psiz_mask) == 0) {
3807             assert(length == 0);
3808             urb_offset++;
3809             break;
3810          }
3811
3812          zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
3813          emit(MOV(zero, fs_reg(0u)));
3814
3815          sources[length++] = zero;
3816          if (vue_map->slots_valid & VARYING_BIT_LAYER)
3817             sources[length++] = this->outputs[VARYING_SLOT_LAYER];
3818          else
3819             sources[length++] = zero;
3820
3821          if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
3822             sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
3823          else
3824             sources[length++] = zero;
3825
3826          if (vue_map->slots_valid & VARYING_BIT_PSIZ)
3827             sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
3828          else
3829             sources[length++] = zero;
3830          break;
3831
3832       case BRW_VARYING_SLOT_NDC:
3833       case VARYING_SLOT_EDGE:
3834          unreachable("unexpected scalar vs output");
3835          break;
3836
3837       case BRW_VARYING_SLOT_PAD:
3838          break;
3839
3840       default:
3841          /* gl_Position is always in the vue map, but isn't always written by
3842           * the shader.  Other varyings (clip distances) get added to the vue
3843           * map but don't always get written.  In those cases, the
3844           * corresponding this->output[] slot will be invalid we and can skip
3845           * the urb write for the varying.  If we've already queued up a vue
3846           * slot for writing we flush a mlen 5 urb write, otherwise we just
3847           * advance the urb_offset.
3848           */
3849          if (this->outputs[varying].file == BAD_FILE) {
3850             if (length > 0)
3851                flush = true;
3852             else
3853                urb_offset++;
3854             break;
3855          }
3856
3857          if ((varying == VARYING_SLOT_COL0 ||
3858               varying == VARYING_SLOT_COL1 ||
3859               varying == VARYING_SLOT_BFC0 ||
3860               varying == VARYING_SLOT_BFC1) &&
3861              key->clamp_vertex_color) {
3862             /* We need to clamp these guys, so do a saturating MOV into a
3863              * temp register and use that for the payload.
3864              */
3865             for (int i = 0; i < 4; i++) {
3866                reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
3867                src = offset(this->outputs[varying], i);
3868                fs_inst *inst = emit(MOV(reg, src));
3869                inst->saturate = true;
3870                sources[length++] = reg;
3871             }
3872          } else {
3873             for (int i = 0; i < 4; i++)
3874                sources[length++] = offset(this->outputs[varying], i);
3875          }
3876          break;
3877       }
3878
3879       current_annotation = "URB write";
3880
3881       /* If we've queued up 8 registers of payload (2 VUE slots), if this is
3882        * the last slot or if we need to flush (see BAD_FILE varying case
3883        * above), emit a URB write send now to flush out the data.
3884        */
3885       int last = slot == vue_map->num_slots - 1;
3886       if (length == 8 || last)
3887          flush = true;
3888       if (flush) {
3889          if (last && (INTEL_DEBUG & DEBUG_SHADER_TIME))
3890             emit_shader_time_end();
3891
3892          fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
3893          fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
3894                                  BRW_REGISTER_TYPE_F);
3895
3896          /* We need WE_all on the MOV for the message header (the URB handles)
3897           * so do a MOV to a dummy register and set force_writemask_all on the
3898           * MOV.  LOAD_PAYLOAD will preserve that.
3899           */
3900          fs_reg dummy = fs_reg(GRF, alloc.allocate(1),
3901                                BRW_REGISTER_TYPE_UD);
3902          fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0),
3903                                                        BRW_REGISTER_TYPE_UD))));
3904          inst->force_writemask_all = true;
3905          payload_sources[0] = dummy;
3906
3907          memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
3908          emit(LOAD_PAYLOAD(payload, payload_sources, length + 1));
3909
3910          inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
3911          inst->eot = last;
3912          inst->mlen = length + 1;
3913          inst->offset = urb_offset;
3914          urb_offset = slot + 1;
3915          length = 0;
3916          flush = false;
3917       }
3918    }
3919 }
3920
3921 void
3922 fs_visitor::resolve_ud_negate(fs_reg *reg)
3923 {
3924    if (reg->type != BRW_REGISTER_TYPE_UD ||
3925        !reg->negate)
3926       return;
3927
3928    fs_reg temp = vgrf(glsl_type::uint_type);
3929    emit(MOV(temp, *reg));
3930    *reg = temp;
3931 }
3932
3933 /**
3934  * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3935  *
3936  * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3937  * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3938  */
3939 void
3940 fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
3941 {
3942    assert(brw->gen <= 5);
3943
3944    if (rvalue->type != glsl_type::bool_type)
3945       return;
3946
3947    fs_reg and_result = vgrf(glsl_type::bool_type);
3948    fs_reg neg_result = vgrf(glsl_type::bool_type);
3949    emit(AND(and_result, *reg, fs_reg(1)));
3950    emit(MOV(neg_result, negate(and_result)));
3951    *reg = neg_result;
3952 }
3953
3954 fs_visitor::fs_visitor(struct brw_context *brw,
3955                        void *mem_ctx,
3956                        const struct brw_wm_prog_key *key,
3957                        struct brw_wm_prog_data *prog_data,
3958                        struct gl_shader_program *shader_prog,
3959                        struct gl_fragment_program *fp,
3960                        unsigned dispatch_width)
3961    : backend_visitor(brw, shader_prog, &fp->Base, &prog_data->base,
3962                      MESA_SHADER_FRAGMENT),
3963      reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
3964      reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
3965      reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
3966      key(key), prog_data(&prog_data->base),
3967      dispatch_width(dispatch_width)
3968 {
3969    this->mem_ctx = mem_ctx;
3970    init();
3971 }
3972
3973 fs_visitor::fs_visitor(struct brw_context *brw,
3974                        void *mem_ctx,
3975                        const struct brw_vs_prog_key *key,
3976                        struct brw_vs_prog_data *prog_data,
3977                        struct gl_shader_program *shader_prog,
3978                        struct gl_vertex_program *cp,
3979                        unsigned dispatch_width)
3980    : backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base.base,
3981                      MESA_SHADER_VERTEX),
3982      reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
3983      reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
3984      reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
3985      key(key), prog_data(&prog_data->base.base),
3986      dispatch_width(dispatch_width)
3987 {
3988    this->mem_ctx = mem_ctx;
3989    init();
3990 }
3991
3992 void
3993 fs_visitor::init()
3994 {
3995    this->failed = false;
3996    this->simd16_unsupported = false;
3997    this->no16_msg = NULL;
3998    this->variable_ht = hash_table_ctor(0,
3999                                        hash_table_pointer_hash,
4000                                        hash_table_pointer_compare);
4001
4002    this->nir_locals = NULL;
4003    this->nir_globals = NULL;
4004
4005    memset(&this->payload, 0, sizeof(this->payload));
4006    memset(this->outputs, 0, sizeof(this->outputs));
4007    memset(this->output_components, 0, sizeof(this->output_components));
4008    this->source_depth_to_render_target = false;
4009    this->runtime_check_aads_emit = false;
4010    this->first_non_payload_grf = 0;
4011    this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
4012
4013    this->current_annotation = NULL;
4014    this->base_ir = NULL;
4015
4016    this->virtual_grf_start = NULL;
4017    this->virtual_grf_end = NULL;
4018    this->live_intervals = NULL;
4019    this->regs_live_at_ip = NULL;
4020
4021    this->uniforms = 0;
4022    this->last_scratch = 0;
4023    this->pull_constant_loc = NULL;
4024    this->push_constant_loc = NULL;
4025
4026    this->spilled_any_registers = false;
4027    this->do_dual_src = false;
4028
4029    if (dispatch_width == 8)
4030       this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
4031 }
4032
4033 fs_visitor::~fs_visitor()
4034 {
4035    hash_table_dtor(this->variable_ht);
4036 }