src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_visitor.cpp
  25  *
  26  * This file supports generating the FS LIR from the GLSL IR.  The LIR
  27  * makes it easier to do backend-specific optimizations than doing so
  28  * in the GLSL IR or in the native code.
  29  */
  30 #include <sys/types.h>
  31
  32 #include "main/macros.h"
  33 #include "main/shaderobj.h"
  34 #include "program/prog_parameter.h"
  35 #include "program/prog_print.h"
  36 #include "program/prog_optimize.h"
  37 #include "util/register_allocate.h"
  38 #include "program/hash_table.h"
  39 #include "brw_context.h"
  40 #include "brw_eu.h"
  41 #include "brw_wm.h"
  42 #include "brw_vec4.h"
  43 #include "brw_fs.h"
  44 #include "main/uniforms.h"
  45 #include "glsl/glsl_types.h"
  46 #include "glsl/ir_optimization.h"
  47 #include "program/sampler.h"
  48
  49
  50 fs_reg *
  51 fs_visitor::emit_vs_system_value(int location)
  52 {
  53    fs_reg *reg = new(this->mem_ctx)
  54       fs_reg(ATTR, VERT_ATTRIB_MAX, BRW_REGISTER_TYPE_D);
  55    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
  56
  57    switch (location) {
  58    case SYSTEM_VALUE_BASE_VERTEX:
  59       reg->reg_offset = 0;
  60       vs_prog_data->uses_vertexid = true;
  61       break;
  62    case SYSTEM_VALUE_VERTEX_ID:
  63    case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
  64       reg->reg_offset = 2;
  65       vs_prog_data->uses_vertexid = true;
  66       break;
  67    case SYSTEM_VALUE_INSTANCE_ID:
  68       reg->reg_offset = 3;
  69       vs_prog_data->uses_instanceid = true;
  70       break;
  71    default:
  72       unreachable("not reached");
  73    }
  74
  75    return reg;
  76 }
  77
  78 void
  79 fs_visitor::visit(ir_variable *ir)
  80 {
  81    fs_reg *reg = NULL;
  82
  83    if (variable_storage(ir))
  84       return;
  85
  86    if (ir->data.mode == ir_var_shader_in) {
  87       assert(ir->data.location != -1);
  88       if (stage == MESA_SHADER_VERTEX) {
  89          reg = new(this->mem_ctx)
  90             fs_reg(ATTR, ir->data.location,
  91                    brw_type_for_base_type(ir->type->get_scalar_type()));
  92       } else if (ir->data.location == VARYING_SLOT_POS) {
  93          reg = emit_fragcoord_interpolation(ir->data.pixel_center_integer,
  94                                             ir->data.origin_upper_left);
  95       } else if (ir->data.location == VARYING_SLOT_FACE) {
  96          reg = emit_frontfacing_interpolation();
  97       } else {
  98          reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
  99          emit_general_interpolation(*reg, ir->name, ir->type,
 100                                     (glsl_interp_qualifier) ir->data.interpolation,
 101                                     ir->data.location, ir->data.centroid,
 102                                     ir->data.sample);
 103       }
 104       assert(reg);
 105       hash_table_insert(this->variable_ht, reg, ir);
 106       return;
 107    } else if (ir->data.mode == ir_var_shader_out) {
 108       reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
 109
 110       if (stage == MESA_SHADER_VERTEX) {
 111          int vector_elements =
 112             ir->type->is_array() ? ir->type->fields.array->vector_elements
 113                                  : ir->type->vector_elements;
 114
 115          for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) {
 116             int output = ir->data.location + i;
 117             this->outputs[output] = *reg;
 118             this->outputs[output].reg_offset = i * 4;
 119             this->output_components[output] = vector_elements;
 120          }
 121
 122       } else if (ir->data.index > 0) {
 123          assert(ir->data.location == FRAG_RESULT_DATA0);
 124          assert(ir->data.index == 1);
 125          this->dual_src_output = *reg;
 126          this->do_dual_src = true;
 127       } else if (ir->data.location == FRAG_RESULT_COLOR) {
 128          /* Writing gl_FragColor outputs to all color regions. */
 129          assert(stage == MESA_SHADER_FRAGMENT);
 130          brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 131          for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
 132             this->outputs[i] = *reg;
 133             this->output_components[i] = 4;
 134          }
 135       } else if (ir->data.location == FRAG_RESULT_DEPTH) {
 136          this->frag_depth = *reg;
 137       } else if (ir->data.location == FRAG_RESULT_SAMPLE_MASK) {
 138          this->sample_mask = *reg;
 139       } else {
 140          /* gl_FragData or a user-defined FS output */
 141          assert(ir->data.location >= FRAG_RESULT_DATA0 &&
 142                 ir->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
 143
 144          int vector_elements =
 145             ir->type->is_array() ? ir->type->fields.array->vector_elements
 146                                  : ir->type->vector_elements;
 147
 148          /* General color output. */
 149          for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
 150             int output = ir->data.location - FRAG_RESULT_DATA0 + i;
 151             this->outputs[output] = offset(*reg, vector_elements * i);
 152             this->output_components[output] = vector_elements;
 153          }
 154       }
 155    } else if (ir->data.mode == ir_var_uniform) {
 156       int param_index = uniforms;
 157
 158       /* Thanks to the lower_ubo_reference pass, we will see only
 159        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 160        * variables, so no need for them to be in variable_ht.
 161        *
 162        * Some uniforms, such as samplers and atomic counters, have no actual
 163        * storage, so we should ignore them.
 164        */
 165       if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
 166          return;
 167
 168       if (dispatch_width == 16) {
 169          if (!variable_storage(ir)) {
 170             fail("Failed to find uniform '%s' in SIMD16\n", ir->name);
 171          }
 172          return;
 173       }
 174
 175       param_size[param_index] = type_size(ir->type);
 176       if (!strncmp(ir->name, "gl_", 3)) {
 177          setup_builtin_uniform_values(ir);
 178       } else {
 179          setup_uniform_values(ir);
 180       }
 181
 182       reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
 183       reg->type = brw_type_for_base_type(ir->type);
 184
 185    } else if (ir->data.mode == ir_var_system_value) {
 186       switch (ir->data.location) {
 187       case SYSTEM_VALUE_BASE_VERTEX:
 188       case SYSTEM_VALUE_VERTEX_ID:
 189       case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
 190       case SYSTEM_VALUE_INSTANCE_ID:
 191          reg = emit_vs_system_value(ir->data.location);
 192          break;
 193       case SYSTEM_VALUE_SAMPLE_POS:
 194          reg = emit_samplepos_setup();
 195          break;
 196       case SYSTEM_VALUE_SAMPLE_ID:
 197          reg = emit_sampleid_setup();
 198          break;
 199       case SYSTEM_VALUE_SAMPLE_MASK_IN:
 200          assert(brw->gen >= 7);
 201          reg = new(mem_ctx)
 202             fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
 203                           BRW_REGISTER_TYPE_D));
 204          break;
 205       }
 206    }
 207
 208    if (!reg)
 209       reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
 210
 211    hash_table_insert(this->variable_ht, reg, ir);
 212 }
 213
 214 void
 215 fs_visitor::visit(ir_dereference_variable *ir)
 216 {
 217    fs_reg *reg = variable_storage(ir->var);
 218
 219    if (!reg) {
 220       fail("Failed to find variable storage for %s\n", ir->var->name);
 221       this->result = fs_reg(reg_null_d);
 222       return;
 223    }
 224    this->result = *reg;
 225 }
 226
 227 void
 228 fs_visitor::visit(ir_dereference_record *ir)
 229 {
 230    const glsl_type *struct_type = ir->record->type;
 231
 232    ir->record->accept(this);
 233
 234    unsigned int off = 0;
 235    for (unsigned int i = 0; i < struct_type->length; i++) {
 236       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
 237          break;
 238       off += type_size(struct_type->fields.structure[i].type);
 239    }
 240    this->result = offset(this->result, off);
 241    this->result.type = brw_type_for_base_type(ir->type);
 242 }
 243
 244 void
 245 fs_visitor::visit(ir_dereference_array *ir)
 246 {
 247    ir_constant *constant_index;
 248    fs_reg src;
 249    int element_size = type_size(ir->type);
 250
 251    constant_index = ir->array_index->as_constant();
 252
 253    ir->array->accept(this);
 254    src = this->result;
 255    src.type = brw_type_for_base_type(ir->type);
 256
 257    if (constant_index) {
 258       if (src.file == ATTR) {
 259          /* Attribute arrays get loaded as one vec4 per element.  In that case
 260           * offset the source register.
 261           */
 262          src.reg += constant_index->value.i[0];
 263       } else {
 264          assert(src.file == UNIFORM || src.file == GRF || src.file == HW_REG);
 265          src = offset(src, constant_index->value.i[0] * element_size);
 266       }
 267    } else {
 268       /* Variable index array dereference.  We attach the variable index
 269        * component to the reg as a pointer to a register containing the
 270        * offset.  Currently only uniform arrays are supported in this patch,
 271        * and that reladdr pointer is resolved by
 272        * move_uniform_array_access_to_pull_constants().  All other array types
 273        * are lowered by lower_variable_index_to_cond_assign().
 274        */
 275       ir->array_index->accept(this);
 276
 277       fs_reg index_reg;
 278       index_reg = vgrf(glsl_type::int_type);
 279       emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));
 280
 281       if (src.reladdr) {
 282          emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
 283       }
 284
 285       src.reladdr = ralloc(mem_ctx, fs_reg);
 286       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
 287    }
 288    this->result = src;
 289 }
 290
 291 fs_inst *
 292 fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
 293                      const fs_reg &a)
 294 {
 295    if (brw->gen < 6) {
 296       /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
 297       fs_reg y_times_a           = vgrf(glsl_type::float_type);
 298       fs_reg one_minus_a         = vgrf(glsl_type::float_type);
 299       fs_reg x_times_one_minus_a = vgrf(glsl_type::float_type);
 300
 301       emit(MUL(y_times_a, y, a));
 302
 303       fs_reg negative_a = a;
 304       negative_a.negate = !a.negate;
 305       emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
 306       emit(MUL(x_times_one_minus_a, x, one_minus_a));
 307
 308       return emit(ADD(dst, x_times_one_minus_a, y_times_a));
 309    } else {
 310       /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
 311        * we need to reorder the operands.
 312        */
 313       return emit(LRP(dst, a, y, x));
 314    }
 315 }
 316
 317 void
 318 fs_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
 319                         const fs_reg &src0, const fs_reg &src1)
 320 {
 321    assert(conditionalmod == BRW_CONDITIONAL_GE ||
 322           conditionalmod == BRW_CONDITIONAL_L);
 323
 324    fs_inst *inst;
 325
 326    if (brw->gen >= 6) {
 327       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 328       inst->conditional_mod = conditionalmod;
 329    } else {
 330       emit(CMP(reg_null_d, src0, src1, conditionalmod));
 331
 332       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 333       inst->predicate = BRW_PREDICATE_NORMAL;
 334    }
 335 }
 336
 337 bool
 338 fs_visitor::try_emit_saturate(ir_expression *ir)
 339 {
 340    if (ir->operation != ir_unop_saturate)
 341       return false;
 342
 343    ir_rvalue *sat_val = ir->operands[0];
 344
 345    fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
 346
 347    sat_val->accept(this);
 348    fs_reg src = this->result;
 349
 350    fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
 351
 352    /* If the last instruction from our accept() generated our
 353     * src, just set the saturate flag instead of emmitting a separate mov.
 354     */
 355    fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
 356    if (modify && modify->regs_written == modify->dst.width / 8 &&
 357        modify->can_do_saturate()) {
 358       modify->saturate = true;
 359       this->result = src;
 360       return true;
 361    }
 362
 363    return false;
 364 }
 365
 366 bool
 367 fs_visitor::try_emit_line(ir_expression *ir)
 368 {
 369    /* LINE's src0 must be of type float. */
 370    if (ir->type != glsl_type::float_type)
 371       return false;
 372
 373    ir_rvalue *nonmul = ir->operands[1];
 374    ir_expression *mul = ir->operands[0]->as_expression();
 375
 376    if (!mul || mul->operation != ir_binop_mul) {
 377       nonmul = ir->operands[0];
 378       mul = ir->operands[1]->as_expression();
 379
 380       if (!mul || mul->operation != ir_binop_mul)
 381          return false;
 382    }
 383
 384    ir_constant *const_add = nonmul->as_constant();
 385    if (!const_add)
 386       return false;
 387
 388    int add_operand_vf = brw_float_to_vf(const_add->value.f[0]);
 389    if (add_operand_vf == -1)
 390       return false;
 391
 392    ir_rvalue *non_const_mul = mul->operands[1];
 393    ir_constant *const_mul = mul->operands[0]->as_constant();
 394    if (!const_mul) {
 395       const_mul = mul->operands[1]->as_constant();
 396
 397       if (!const_mul)
 398          return false;
 399
 400       non_const_mul = mul->operands[0];
 401    }
 402
 403    int mul_operand_vf = brw_float_to_vf(const_mul->value.f[0]);
 404    if (mul_operand_vf == -1)
 405       return false;
 406
 407    non_const_mul->accept(this);
 408    fs_reg src1 = this->result;
 409
 410    fs_reg src0 = vgrf(ir->type);
 411    emit(BRW_OPCODE_MOV, src0,
 412         fs_reg((uint8_t)mul_operand_vf, 0, 0, (uint8_t)add_operand_vf));
 413
 414    this->result = vgrf(ir->type);
 415    emit(BRW_OPCODE_LINE, this->result, src0, src1);
 416    return true;
 417 }
 418
 419 bool
 420 fs_visitor::try_emit_mad(ir_expression *ir)
 421 {
 422    /* 3-src instructions were introduced in gen6. */
 423    if (brw->gen < 6)
 424       return false;
 425
 426    /* MAD can only handle floating-point data. */
 427    if (ir->type != glsl_type::float_type)
 428       return false;
 429
 430    ir_rvalue *nonmul;
 431    ir_expression *mul;
 432    bool mul_negate, mul_abs;
 433
 434    for (int i = 0; i < 2; i++) {
 435       mul_negate = false;
 436       mul_abs = false;
 437
 438       mul = ir->operands[i]->as_expression();
 439       nonmul = ir->operands[1 - i];
 440
 441       if (mul && mul->operation == ir_unop_abs) {
 442          mul = mul->operands[0]->as_expression();
 443          mul_abs = true;
 444       } else if (mul && mul->operation == ir_unop_neg) {
 445          mul = mul->operands[0]->as_expression();
 446          mul_negate = true;
 447       }
 448
 449       if (mul && mul->operation == ir_binop_mul)
 450          break;
 451    }
 452
 453    if (!mul || mul->operation != ir_binop_mul)
 454       return false;
 455
 456    nonmul->accept(this);
 457    fs_reg src0 = this->result;
 458
 459    mul->operands[0]->accept(this);
 460    fs_reg src1 = this->result;
 461    src1.negate ^= mul_negate;
 462    src1.abs = mul_abs;
 463    if (mul_abs)
 464       src1.negate = false;
 465
 466    mul->operands[1]->accept(this);
 467    fs_reg src2 = this->result;
 468    src2.abs = mul_abs;
 469    if (mul_abs)
 470       src2.negate = false;
 471
 472    this->result = vgrf(ir->type);
 473    emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
 474
 475    return true;
 476 }
 477
 478 bool
 479 fs_visitor::try_emit_b2f_of_comparison(ir_expression *ir)
 480 {
 481    /* On platforms that do not natively generate 0u and ~0u for Boolean
 482     * results, b2f expressions that look like
 483     *
 484     *     f = b2f(expr cmp 0)
 485     *
 486     * will generate better code by pretending the expression is
 487     *
 488     *     f = ir_triop_csel(0.0, 1.0, expr cmp 0)
 489     *
 490     * This is because the last instruction of "expr" can generate the
 491     * condition code for the "cmp 0".  This avoids having to do the "-(b & 1)"
 492     * trick to generate 0u or ~0u for the Boolean result.  This means code like
 493     *
 494     *     mov(16)         g16<1>F         1F
 495     *     mul.ge.f0(16)   null            g6<8,8,1>F      g14<8,8,1>F
 496     *     (+f0) sel(16)   m6<1>F          g16<8,8,1>F     0F
 497     *
 498     * will be generated instead of
 499     *
 500     *     mul(16)         g2<1>F          g12<8,8,1>F     g4<8,8,1>F
 501     *     cmp.ge.f0(16)   g2<1>D          g4<8,8,1>F      0F
 502     *     and(16)         g4<1>D          g2<8,8,1>D      1D
 503     *     and(16)         m6<1>D          -g4<8,8,1>D     0x3f800000UD
 504     *
 505     * When the comparison is != 0.0 using the knowledge that the false case
 506     * already results in zero would allow better code generation by possibly
 507     * avoiding a load-immediate instruction.
 508     */
 509    ir_expression *cmp = ir->operands[0]->as_expression();
 510    if (cmp == NULL)
 511       return false;
 512
 513    if (cmp->operation == ir_binop_nequal) {
 514       for (unsigned i = 0; i < 2; i++) {
 515          ir_constant *c = cmp->operands[i]->as_constant();
 516          if (c == NULL || !c->is_zero())
 517             continue;
 518
 519          ir_expression *expr = cmp->operands[i ^ 1]->as_expression();
 520          if (expr != NULL) {
 521             fs_reg op[2];
 522
 523             for (unsigned j = 0; j < 2; j++) {
 524                cmp->operands[j]->accept(this);
 525                op[j] = this->result;
 526
 527                resolve_ud_negate(&op[j]);
 528             }
 529
 530             emit_bool_to_cond_code_of_reg(cmp, op);
 531
 532             /* In this case we know when the condition is true, op[i ^ 1]
 533              * contains zero.  Invert the predicate, use op[i ^ 1] as src0,
 534              * and immediate 1.0f as src1.
 535              */
 536             this->result = vgrf(ir->type);
 537             op[i ^ 1].type = BRW_REGISTER_TYPE_F;
 538
 539             fs_inst *inst = emit(SEL(this->result, op[i ^ 1], fs_reg(1.0f)));
 540             inst->predicate = BRW_PREDICATE_NORMAL;
 541             inst->predicate_inverse = true;
 542             return true;
 543          }
 544       }
 545    }
 546
 547    emit_bool_to_cond_code(cmp);
 548
 549    fs_reg temp = vgrf(ir->type);
 550    emit(MOV(temp, fs_reg(1.0f)));
 551
 552    this->result = vgrf(ir->type);
 553    fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f)));
 554    inst->predicate = BRW_PREDICATE_NORMAL;
 555
 556    return true;
 557 }
 558
 /**
  * Pack a pixel offset into the 4-bit signed fixed-point (S0.4) field used by
  * the constant-offset interpolation message.
  *
  * \param x  Offset in pixels.
  * \return The offset in 1/16th pixel units, truncated toward zero, clamped
  *         to [-8, 7] and masked to its low four bits.
  *
  * The upper end is clamped to +7/16; see the explanation in the non-constant
  * offset case below.  The lower end is clamped to -8/16 as well: without it
  * a value below -8 would wrap around when masked to four bits and come out
  * as a positive offset.  The clamp is done on the float so that very large
  * magnitudes never reach the float-to-int conversion.
  */
 static int
 pack_pixel_offset(float x)
 {
    const float scaled = x * 16;
    int n;

    if (scaled >= 7.0f)
       n = 7;
    else if (scaled <= -8.0f)
       n = -8;
    else
       n = (int) scaled;

    return n & 0xf;
 }
 567
 568 void
 569 fs_visitor::emit_interpolate_expression(ir_expression *ir)
 570 {
 571    /* in SIMD16 mode, the pixel interpolator returns coords interleaved
 572     * 8 channels at a time, same as the barycentric coords presented in
 573     * the FS payload. this requires a bit of extra work to support.
 574     */
 575    no16("interpolate_at_* not yet supported in SIMD16 mode.");
 576
 577    assert(stage == MESA_SHADER_FRAGMENT);
 578    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 579
 580    ir_dereference * deref = ir->operands[0]->as_dereference();
 581    ir_swizzle * swiz = NULL;
 582    if (!deref) {
 583       /* the api does not allow a swizzle here, but the varying packing code
 584        * may have pushed one into here.
 585        */
 586       swiz = ir->operands[0]->as_swizzle();
 587       assert(swiz);
 588       deref = swiz->val->as_dereference();
 589    }
 590    assert(deref);
 591    ir_variable * var = deref->variable_referenced();
 592    assert(var);
 593
 594    /* 1. collect interpolation factors */
 595
 596    fs_reg dst_xy = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
 597
 598    /* for most messages, we need one reg of ignored data; the hardware requires mlen==1
 599     * even when there is no payload. in the per-slot offset case, we'll replace this with
 600     * the proper source data. */
 601    fs_reg src = vgrf(glsl_type::float_type);
 602    int mlen = 1;     /* one reg unless overriden */
 603    int reg_width = dispatch_width / 8;
 604    fs_inst *inst;
 605
 606    switch (ir->operation) {
 607    case ir_unop_interpolate_at_centroid:
 608       inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u));
 609       break;
 610
 611    case ir_binop_interpolate_at_sample: {
 612       ir_constant *sample_num = ir->operands[1]->as_constant();
 613       assert(sample_num || !"nonconstant sample number should have been lowered.");
 614
 615       unsigned msg_data = sample_num->value.i[0] << 4;
 616       inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src, fs_reg(msg_data));
 617       break;
 618    }
 619
 620    case ir_binop_interpolate_at_offset: {
 621       ir_constant *const_offset = ir->operands[1]->as_constant();
 622       if (const_offset) {
 623          unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
 624                             (pack_pixel_offset(const_offset->value.f[1]) << 4);
 625          inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
 626                      fs_reg(msg_data));
 627       } else {
 628          /* pack the operands: hw wants offsets as 4 bit signed ints */
 629          ir->operands[1]->accept(this);
 630          src = vgrf(glsl_type::ivec2_type);
 631          fs_reg src2 = src;
 632          for (int i = 0; i < 2; i++) {
 633             fs_reg temp = vgrf(glsl_type::float_type);
 634             emit(MUL(temp, this->result, fs_reg(16.0f)));
 635             emit(MOV(src2, temp));  /* float to int */
 636
 637             /* Clamp the upper end of the range to +7/16. ARB_gpu_shader5 requires
 638              * that we support a maximum offset of +0.5, which isn't representable
 639              * in a S0.4 value -- if we didn't clamp it, we'd end up with -8/16,
 640              * which is the opposite of what the shader author wanted.
 641              *
 642              * This is legal due to ARB_gpu_shader5's quantization rules:
 643              *
 644              * "Not all values of <offset> may be supported; x and y offsets may
 645              * be rounded to fixed-point values with the number of fraction bits
 646              * given by the implementation-dependent constant
 647              * FRAGMENT_INTERPOLATION_OFFSET_BITS"
 648              */
 649
 650             fs_inst *inst = emit(BRW_OPCODE_SEL, src2, src2, fs_reg(7));
 651             inst->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
 652
 653             src2 = offset(src2, 1);
 654             this->result = offset(this->result, 1);
 655          }
 656
 657          mlen = 2 * reg_width;
 658          inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
 659                      fs_reg(0u));
 660       }
 661       break;
 662    }
 663
 664    default:
 665       unreachable("not reached");
 666    }
 667
 668    inst->mlen = mlen;
 669    inst->regs_written = 2 * reg_width; /* 2 floats per slot returned */
 670    inst->pi_noperspective = var->determine_interpolation_mode(key->flat_shade) ==
 671          INTERP_QUALIFIER_NOPERSPECTIVE;
 672
 673    /* 2. emit linterp */
 674
 675    fs_reg res = vgrf(ir->type);
 676    this->result = res;
 677
 678    for (int i = 0; i < ir->type->vector_elements; i++) {
 679       int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
 680       emit(FS_OPCODE_LINTERP, res, dst_xy,
 681            fs_reg(interp_reg(var->data.location, ch)));
 682       res = offset(res, 1);
 683    }
 684 }
 685
 686 void
 687 fs_visitor::visit(ir_expression *ir)
 688 {
 689    unsigned int operand;
 690    fs_reg op[3], temp;
 691    fs_inst *inst;
 692    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
 693
 694    assert(ir->get_num_operands() <= 3);
 695
 696    if (try_emit_saturate(ir))
 697       return;
 698
 699    /* Deal with the real oddball stuff first */
 700    switch (ir->operation) {
 701    case ir_binop_add:
 702       if (brw->gen <= 5 && try_emit_line(ir))
 703          return;
 704       if (try_emit_mad(ir))
 705          return;
 706       break;
 707
 708    case ir_triop_csel:
 709       ir->operands[1]->accept(this);
 710       op[1] = this->result;
 711       ir->operands[2]->accept(this);
 712       op[2] = this->result;
 713
 714       emit_bool_to_cond_code(ir->operands[0]);
 715
 716       this->result = vgrf(ir->type);
 717       inst = emit(SEL(this->result, op[1], op[2]));
 718       inst->predicate = BRW_PREDICATE_NORMAL;
 719       return;
 720
 721    case ir_unop_b2f:
 722       if (brw->gen <= 5 && try_emit_b2f_of_comparison(ir))
 723          return;
 724       break;
 725
 726    case ir_unop_interpolate_at_centroid:
 727    case ir_binop_interpolate_at_offset:
 728    case ir_binop_interpolate_at_sample:
 729       emit_interpolate_expression(ir);
 730       return;
 731
 732    default:
 733       break;
 734    }
 735
 736    for (operand = 0; operand < ir->get_num_operands(); operand++) {
 737       ir->operands[operand]->accept(this);
 738       if (this->result.file == BAD_FILE) {
 739          fail("Failed to get tree for expression operand:\n");
 740          ir->operands[operand]->fprint(stderr);
 741          fprintf(stderr, "\n");
 742       }
 743       assert(this->result.file == GRF ||
 744              this->result.file == UNIFORM || this->result.file == ATTR);
 745       op[operand] = this->result;
 746
 747       /* Matrix expression operands should have been broken down to vector
 748        * operations already.
 749        */
 750       assert(!ir->operands[operand]->type->is_matrix());
 751       /* And then those vector operands should have been broken down to scalar.
 752        */
 753       assert(!ir->operands[operand]->type->is_vector());
 754    }
 755
 756    /* Storage for our result.  If our result goes into an assignment, it will
 757     * just get copy-propagated out, so no worries.
 758     */
 759    this->result = vgrf(ir->type);
 760
 761    switch (ir->operation) {
 762    case ir_unop_logic_not:
 763       emit(NOT(this->result, op[0]));
 764       break;
 765    case ir_unop_neg:
 766       op[0].negate = !op[0].negate;
 767       emit(MOV(this->result, op[0]));
 768       break;
 769    case ir_unop_abs:
 770       op[0].abs = true;
 771       op[0].negate = false;
 772       emit(MOV(this->result, op[0]));
 773       break;
 774    case ir_unop_sign:
 775       if (ir->type->is_float()) {
 776          /* AND(val, 0x80000000) gives the sign bit.
 777           *
 778           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
 779           * zero.
 780           */
 781          emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 782
 783          op[0].type = BRW_REGISTER_TYPE_UD;
 784          this->result.type = BRW_REGISTER_TYPE_UD;
 785          emit(AND(this->result, op[0], fs_reg(0x80000000u)));
 786
 787          inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u)));
 788          inst->predicate = BRW_PREDICATE_NORMAL;
 789
 790          this->result.type = BRW_REGISTER_TYPE_F;
 791       } else {
 792          /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
 793           *               -> non-negative val generates 0x00000000.
 794           *  Predicated OR sets 1 if val is positive.
 795           */
 796          emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));
 797
 798          emit(ASR(this->result, op[0], fs_reg(31)));
 799
 800          inst = emit(OR(this->result, this->result, fs_reg(1)));
 801          inst->predicate = BRW_PREDICATE_NORMAL;
 802       }
 803       break;
 804    case ir_unop_rcp:
 805       emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
 806       break;
 807
 808    case ir_unop_exp2:
 809       emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
 810       break;
 811    case ir_unop_log2:
 812       emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
 813       break;
 814    case ir_unop_exp:
 815    case ir_unop_log:
 816       unreachable("not reached: should be handled by ir_explog_to_explog2");
 817    case ir_unop_sin:
 818       emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
 819       break;
 820    case ir_unop_cos:
 821       emit_math(SHADER_OPCODE_COS, this->result, op[0]);
 822       break;
 823
 824    case ir_unop_dFdx:
 825       /* Select one of the two opcodes based on the glHint value. */
 826       if (fs_key->high_quality_derivatives)
 827          emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
 828       else
 829          emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
 830       break;
 831
 832    case ir_unop_dFdx_coarse:
 833       emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
 834       break;
 835
 836    case ir_unop_dFdx_fine:
 837       emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
 838       break;
 839
 840    case ir_unop_dFdy:
 841       /* Select one of the two opcodes based on the glHint value. */
 842       if (fs_key->high_quality_derivatives)
 843          emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
 844       else
 845          emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
 846       break;
 847
 848    case ir_unop_dFdy_coarse:
 849       emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
 850       break;
 851
 852    case ir_unop_dFdy_fine:
 853       emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
 854       break;
 855
 856    case ir_binop_add:
 857       emit(ADD(this->result, op[0], op[1]));
 858       break;
 859    case ir_binop_sub:
 860       unreachable("not reached: should be handled by ir_sub_to_add_neg");
 861
 862    case ir_binop_mul:
 863       if (brw->gen < 8 && ir->type->is_integer()) {
 864          /* For integer multiplication, the MUL uses the low 16 bits
 865           * of one of the operands (src0 on gen6, src1 on gen7).  The
 866           * MACH accumulates in the contribution of the upper 16 bits
 867           * of that operand.
 868           */
 869          if (ir->operands[0]->is_uint16_constant()) {
 870             if (brw->gen < 7)
 871                emit(MUL(this->result, op[0], op[1]));
 872             else
 873                emit(MUL(this->result, op[1], op[0]));
 874          } else if (ir->operands[1]->is_uint16_constant()) {
 875             if (brw->gen < 7)
 876                emit(MUL(this->result, op[1], op[0]));
 877             else
 878                emit(MUL(this->result, op[0], op[1]));
 879          } else {
 880             if (brw->gen >= 7)
 881                no16("SIMD16 explicit accumulator operands unsupported\n");
 882
 883             struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 884                                         this->result.type);
 885
 886             emit(MUL(acc, op[0], op[1]));
 887             emit(MACH(reg_null_d, op[0], op[1]));
 888             emit(MOV(this->result, fs_reg(acc)));
 889          }
 890       } else {
 891          emit(MUL(this->result, op[0], op[1]));
 892       }
 893       break;
 894    case ir_binop_imul_high: {
 895       if (brw->gen == 7)
 896          no16("SIMD16 explicit accumulator operands unsupported\n");
 897
 898       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 899                                   this->result.type);
 900
 901       fs_inst *mul = emit(MUL(acc, op[0], op[1]));
 902       emit(MACH(this->result, op[0], op[1]));
 903
 904       /* Until Gen8, integer multiplies read 32-bits from one source, and
 905        * 16-bits from the other, and relying on the MACH instruction to
 906        * generate the high bits of the result.
 907        *
 908        * On Gen8, the multiply instruction does a full 32x32-bit multiply,
 909        * but in order to do a 64x64-bit multiply we have to simulate the
 910        * previous behavior and then use a MACH instruction.
 911        *
 912        * FINISHME: Don't use source modifiers on src1.
 913        */
 914       if (brw->gen >= 8) {
 915          assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
 916                 mul->src[1].type == BRW_REGISTER_TYPE_UD);
 917          if (mul->src[1].type == BRW_REGISTER_TYPE_D) {
 918             mul->src[1].type = BRW_REGISTER_TYPE_W;
 919          } else {
 920             mul->src[1].type = BRW_REGISTER_TYPE_UW;
 921          }
 922       }
 923
 924       break;
 925    }
 926    case ir_binop_div:
 927       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
 928       assert(ir->type->is_integer());
 929       emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
 930       break;
 931    case ir_binop_carry: {
 932       if (brw->gen == 7)
 933          no16("SIMD16 explicit accumulator operands unsupported\n");
 934
 935       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 936                                   BRW_REGISTER_TYPE_UD);
 937
 938       emit(ADDC(reg_null_ud, op[0], op[1]));
 939       emit(MOV(this->result, fs_reg(acc)));
 940       break;
 941    }
 942    case ir_binop_borrow: {
 943       if (brw->gen == 7)
 944          no16("SIMD16 explicit accumulator operands unsupported\n");
 945
 946       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 947                                   BRW_REGISTER_TYPE_UD);
 948
 949       emit(SUBB(reg_null_ud, op[0], op[1]));
 950       emit(MOV(this->result, fs_reg(acc)));
 951       break;
 952    }
 953    case ir_binop_mod:
 954       /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
 955       assert(ir->type->is_integer());
 956       emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
 957       break;
 958
 959    case ir_binop_less:
 960    case ir_binop_greater:
 961    case ir_binop_lequal:
 962    case ir_binop_gequal:
 963    case ir_binop_equal:
 964    case ir_binop_all_equal:
 965    case ir_binop_nequal:
 966    case ir_binop_any_nequal:
 967       if (brw->gen <= 5) {
 968          resolve_bool_comparison(ir->operands[0], &op[0]);
 969          resolve_bool_comparison(ir->operands[1], &op[1]);
 970       }
 971
 972       emit(CMP(this->result, op[0], op[1],
 973                brw_conditional_for_comparison(ir->operation)));
 974       break;
 975
 976    case ir_binop_logic_xor:
 977       emit(XOR(this->result, op[0], op[1]));
 978       break;
 979
 980    case ir_binop_logic_or:
 981       emit(OR(this->result, op[0], op[1]));
 982       break;
 983
 984    case ir_binop_logic_and:
 985       emit(AND(this->result, op[0], op[1]));
 986       break;
 987
 988    case ir_binop_dot:
 989    case ir_unop_any:
 990       unreachable("not reached: should be handled by brw_fs_channel_expressions");
 991
 992    case ir_unop_noise:
 993       unreachable("not reached: should be handled by lower_noise");
 994
 995    case ir_quadop_vector:
 996       unreachable("not reached: should be handled by lower_quadop_vector");
 997
 998    case ir_binop_vector_extract:
 999       unreachable("not reached: should be handled by lower_vec_index_to_cond_assign()");
1000
1001    case ir_triop_vector_insert:
1002       unreachable("not reached: should be handled by lower_vector_insert()");
1003
1004    case ir_binop_ldexp:
1005       unreachable("not reached: should be handled by ldexp_to_arith()");
1006
1007    case ir_unop_sqrt:
1008       emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
1009       break;
1010
1011    case ir_unop_rsq:
1012       emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
1013       break;
1014
1015    case ir_unop_bitcast_i2f:
1016    case ir_unop_bitcast_u2f:
1017       op[0].type = BRW_REGISTER_TYPE_F;
1018       this->result = op[0];
1019       break;
1020    case ir_unop_i2u:
1021    case ir_unop_bitcast_f2u:
1022       op[0].type = BRW_REGISTER_TYPE_UD;
1023       this->result = op[0];
1024       break;
1025    case ir_unop_u2i:
1026    case ir_unop_bitcast_f2i:
1027       op[0].type = BRW_REGISTER_TYPE_D;
1028       this->result = op[0];
1029       break;
1030    case ir_unop_i2f:
1031    case ir_unop_u2f:
1032    case ir_unop_f2i:
1033    case ir_unop_f2u:
1034       emit(MOV(this->result, op[0]));
1035       break;
1036
1037    case ir_unop_b2i:
1038       emit(AND(this->result, op[0], fs_reg(1)));
1039       break;
1040    case ir_unop_b2f:
1041       if (brw->gen <= 5) {
1042          resolve_bool_comparison(ir->operands[0], &op[0]);
1043       }
1044       op[0].type = BRW_REGISTER_TYPE_D;
1045       this->result.type = BRW_REGISTER_TYPE_D;
1046       emit(AND(this->result, op[0], fs_reg(0x3f800000u)));
1047       this->result.type = BRW_REGISTER_TYPE_F;
1048       break;
1049
1050    case ir_unop_f2b:
1051       emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
1052       break;
1053    case ir_unop_i2b:
1054       emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
1055       break;
1056
1057    case ir_unop_trunc:
1058       emit(RNDZ(this->result, op[0]));
1059       break;
1060    case ir_unop_ceil: {
1061          fs_reg tmp = vgrf(ir->type);
1062          op[0].negate = !op[0].negate;
1063          emit(RNDD(tmp, op[0]));
1064          tmp.negate = true;
1065          emit(MOV(this->result, tmp));
1066       }
1067       break;
1068    case ir_unop_floor:
1069       emit(RNDD(this->result, op[0]));
1070       break;
1071    case ir_unop_fract:
1072       emit(FRC(this->result, op[0]));
1073       break;
1074    case ir_unop_round_even:
1075       emit(RNDE(this->result, op[0]));
1076       break;
1077
1078    case ir_binop_min:
1079    case ir_binop_max:
1080       resolve_ud_negate(&op[0]);
1081       resolve_ud_negate(&op[1]);
1082       emit_minmax(ir->operation == ir_binop_min ?
1083                   BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
1084                   this->result, op[0], op[1]);
1085       break;
1086    case ir_unop_pack_snorm_2x16:
1087    case ir_unop_pack_snorm_4x8:
1088    case ir_unop_pack_unorm_2x16:
1089    case ir_unop_pack_unorm_4x8:
1090    case ir_unop_unpack_snorm_2x16:
1091    case ir_unop_unpack_snorm_4x8:
1092    case ir_unop_unpack_unorm_2x16:
1093    case ir_unop_unpack_unorm_4x8:
1094    case ir_unop_unpack_half_2x16:
1095    case ir_unop_pack_half_2x16:
1096       unreachable("not reached: should be handled by lower_packing_builtins");
1097    case ir_unop_unpack_half_2x16_split_x:
1098       emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
1099       break;
1100    case ir_unop_unpack_half_2x16_split_y:
1101       emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
1102       break;
1103    case ir_binop_pow:
1104       emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
1105       break;
1106
1107    case ir_unop_bitfield_reverse:
1108       emit(BFREV(this->result, op[0]));
1109       break;
1110    case ir_unop_bit_count:
1111       emit(CBIT(this->result, op[0]));
1112       break;
1113    case ir_unop_find_msb:
1114       temp = vgrf(glsl_type::uint_type);
1115       emit(FBH(temp, op[0]));
1116
1117       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1118        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1119        * subtract the result from 31 to convert the MSB count into an LSB count.
1120        */
1121
1122       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1123       emit(MOV(this->result, temp));
1124       emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));
1125
1126       temp.negate = true;
1127       inst = emit(ADD(this->result, temp, fs_reg(31)));
1128       inst->predicate = BRW_PREDICATE_NORMAL;
1129       break;
1130    case ir_unop_find_lsb:
1131       emit(FBL(this->result, op[0]));
1132       break;
1133    case ir_unop_saturate:
1134       inst = emit(MOV(this->result, op[0]));
1135       inst->saturate = true;
1136       break;
1137    case ir_triop_bitfield_extract:
1138       /* Note that the instruction's argument order is reversed from GLSL
1139        * and the IR.
1140        */
1141       emit(BFE(this->result, op[2], op[1], op[0]));
1142       break;
1143    case ir_binop_bfm:
1144       emit(BFI1(this->result, op[0], op[1]));
1145       break;
1146    case ir_triop_bfi:
1147       emit(BFI2(this->result, op[0], op[1], op[2]));
1148       break;
1149    case ir_quadop_bitfield_insert:
1150       unreachable("not reached: should be handled by "
1151               "lower_instructions::bitfield_insert_to_bfm_bfi");
1152
1153    case ir_unop_bit_not:
1154       emit(NOT(this->result, op[0]));
1155       break;
1156    case ir_binop_bit_and:
1157       emit(AND(this->result, op[0], op[1]));
1158       break;
1159    case ir_binop_bit_xor:
1160       emit(XOR(this->result, op[0], op[1]));
1161       break;
1162    case ir_binop_bit_or:
1163       emit(OR(this->result, op[0], op[1]));
1164       break;
1165
1166    case ir_binop_lshift:
1167       emit(SHL(this->result, op[0], op[1]));
1168       break;
1169
1170    case ir_binop_rshift:
1171       if (ir->type->base_type == GLSL_TYPE_INT)
1172          emit(ASR(this->result, op[0], op[1]));
1173       else
1174          emit(SHR(this->result, op[0], op[1]));
1175       break;
1176    case ir_binop_pack_half_2x16_split:
1177       emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
1178       break;
1179    case ir_binop_ubo_load: {
1180       /* This IR node takes a constant uniform block and a constant or
1181        * variable byte offset within the block and loads a vector from that.
1182        */
1183       ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1184       ir_constant *const_offset = ir->operands[1]->as_constant();
1185       fs_reg surf_index;
1186
1187       if (const_uniform_block) {
1188          /* The block index is a constant, so just emit the binding table entry
1189           * as an immediate.
1190           */
1191          surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
1192                                  const_uniform_block->value.u[0]);
1193       } else {
1194          /* The block index is not a constant. Evaluate the index expression
1195           * per-channel and add the base UBO index; the generator will select
1196           * a value from any live channel.
1197           */
1198          surf_index = vgrf(glsl_type::uint_type);
1199          emit(ADD(surf_index, op[0],
1200                   fs_reg(stage_prog_data->binding_table.ubo_start)))
1201             ->force_writemask_all = true;
1202
1203          /* Assume this may touch any UBO. It would be nice to provide
1204           * a tighter bound, but the array information is already lowered away.
1205           */
1206          brw_mark_surface_used(prog_data,
1207                                stage_prog_data->binding_table.ubo_start +
1208                                shader_prog->NumUniformBlocks - 1);
1209       }
1210
1211       if (const_offset) {
1212          fs_reg packed_consts = vgrf(glsl_type::float_type);
1213          packed_consts.type = result.type;
1214
1215          fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
1216          emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
1217                                    packed_consts, surf_index, const_offset_reg));
1218
1219          for (int i = 0; i < ir->type->vector_elements; i++) {
1220             packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i);
1221
1222             /* The std140 packing rules don't allow vectors to cross 16-byte
1223              * boundaries, and a reg is 32 bytes.
1224              */
1225             assert(packed_consts.subreg_offset < 32);
1226
1227             /* UBO bools are any nonzero value.  We consider bools to be
1228              * values with the low bit set to 1.  Convert them using CMP.
1229              */
1230             if (ir->type->base_type == GLSL_TYPE_BOOL) {
1231                emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
1232             } else {
1233                emit(MOV(result, packed_consts));
1234             }
1235
1236             result = offset(result, 1);
1237          }
1238       } else {
1239          /* Turn the byte offset into a dword offset. */
1240          fs_reg base_offset = vgrf(glsl_type::int_type);
1241          emit(SHR(base_offset, op[1], fs_reg(2)));
1242
1243          for (int i = 0; i < ir->type->vector_elements; i++) {
1244             emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
1245                                             base_offset, i));
1246
1247             if (ir->type->base_type == GLSL_TYPE_BOOL)
1248                emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
1249
1250             result = offset(result, 1);
1251          }
1252       }
1253
1254       result.reg_offset = 0;
1255       break;
1256    }
1257
1258    case ir_triop_fma:
1259       /* Note that the instruction's argument order is reversed from GLSL
1260        * and the IR.
1261        */
1262       emit(MAD(this->result, op[2], op[1], op[0]));
1263       break;
1264
1265    case ir_triop_lrp:
1266       emit_lrp(this->result, op[0], op[1], op[2]);
1267       break;
1268
1269    case ir_triop_csel:
1270    case ir_unop_interpolate_at_centroid:
1271    case ir_binop_interpolate_at_offset:
1272    case ir_binop_interpolate_at_sample:
1273       unreachable("already handled above");
1274       break;
1275
1276    case ir_unop_d2f:
1277    case ir_unop_f2d:
1278    case ir_unop_d2i:
1279    case ir_unop_i2d:
1280    case ir_unop_d2u:
1281    case ir_unop_u2d:
1282    case ir_unop_d2b:
1283    case ir_unop_pack_double_2x32:
1284    case ir_unop_unpack_double_2x32:
1285    case ir_unop_frexp_sig:
1286    case ir_unop_frexp_exp:
1287       unreachable("fp64 todo");
1288       break;
1289    }
1290 }
1291
1292 void
1293 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
1294                                    const glsl_type *type, bool predicated)
1295 {
1296    switch (type->base_type) {
1297    case GLSL_TYPE_FLOAT:
1298    case GLSL_TYPE_UINT:
1299    case GLSL_TYPE_INT:
1300    case GLSL_TYPE_BOOL:
1301       for (unsigned int i = 0; i < type->components(); i++) {
1302          l.type = brw_type_for_base_type(type);
1303          r.type = brw_type_for_base_type(type);
1304
1305          if (predicated || !l.equals(r)) {
1306             fs_inst *inst = emit(MOV(l, r));
1307             inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
1308          }
1309
1310          l = offset(l, 1);
1311          r = offset(r, 1);
1312       }
1313       break;
1314    case GLSL_TYPE_ARRAY:
1315       for (unsigned int i = 0; i < type->length; i++) {
1316          emit_assignment_writes(l, r, type->fields.array, predicated);
1317       }
1318       break;
1319
1320    case GLSL_TYPE_STRUCT:
1321       for (unsigned int i = 0; i < type->length; i++) {
1322          emit_assignment_writes(l, r, type->fields.structure[i].type,
1323                                 predicated);
1324       }
1325       break;
1326
1327    case GLSL_TYPE_SAMPLER:
1328    case GLSL_TYPE_IMAGE:
1329    case GLSL_TYPE_ATOMIC_UINT:
1330       break;
1331
1332    case GLSL_TYPE_DOUBLE:
1333    case GLSL_TYPE_VOID:
1334    case GLSL_TYPE_ERROR:
1335    case GLSL_TYPE_INTERFACE:
1336       unreachable("not reached");
1337    }
1338 }
1339
1340 /* If the RHS processing resulted in an instruction generating a
1341  * temporary value, and it would be easy to rewrite the instruction to
1342  * generate its result right into the LHS instead, do so.  This ends
1343  * up reliably removing instructions where it can be tricky to do so
1344  * later without real UD chain information.
1345  */
1346 bool
1347 fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1348                                    fs_reg dst,
1349                                    fs_reg src,
1350                                    fs_inst *pre_rhs_inst,
1351                                    fs_inst *last_rhs_inst)
1352 {
1353    /* Only attempt if we're doing a direct assignment. */
1354    if (ir->condition ||
1355        !(ir->lhs->type->is_scalar() ||
1356         (ir->lhs->type->is_vector() &&
1357          ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
1358       return false;
1359
1360    /* Make sure the last instruction generated our source reg. */
1361    fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
1362                                                     last_rhs_inst,
1363                                                     src);
1364    if (!modify)
1365       return false;
1366
1367    /* If last_rhs_inst wrote a different number of components than our LHS,
1368     * we can't safely rewrite it.
1369     */
1370    if (alloc.sizes[dst.reg] != modify->regs_written)
1371       return false;
1372
1373    /* Success!  Rewrite the instruction. */
1374    modify->dst = dst;
1375
1376    return true;
1377 }
1378
1379 void
1380 fs_visitor::visit(ir_assignment *ir)
1381 {
1382    fs_reg l, r;
1383    fs_inst *inst;
1384
1385    /* FINISHME: arrays on the lhs */
1386    ir->lhs->accept(this);
1387    l = this->result;
1388
1389    fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
1390
1391    ir->rhs->accept(this);
1392    r = this->result;
1393
1394    fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
1395
1396    assert(l.file != BAD_FILE);
1397    assert(r.file != BAD_FILE);
1398
1399    if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
1400       return;
1401
1402    if (ir->condition) {
1403       emit_bool_to_cond_code(ir->condition);
1404    }
1405
1406    if (ir->lhs->type->is_scalar() ||
1407        ir->lhs->type->is_vector()) {
1408       for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
1409          if (ir->write_mask & (1 << i)) {
1410             inst = emit(MOV(l, r));
1411             if (ir->condition)
1412                inst->predicate = BRW_PREDICATE_NORMAL;
1413             r = offset(r, 1);
1414          }
1415          l = offset(l, 1);
1416       }
1417    } else {
1418       emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
1419    }
1420 }
1421
1422 fs_inst *
1423 fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
1424                               fs_reg coordinate, int coord_components,
1425                               fs_reg shadow_c,
1426                               fs_reg lod, fs_reg dPdy, int grad_components,
1427                               uint32_t sampler)
1428 {
1429    int mlen;
1430    int base_mrf = 1;
1431    bool simd16 = false;
1432    fs_reg orig_dst;
1433
1434    /* g0 header. */
1435    mlen = 1;
1436
1437    if (shadow_c.file != BAD_FILE) {
1438       for (int i = 0; i < coord_components; i++) {
1439          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1440          coordinate = offset(coordinate, 1);
1441       }
1442
1443       /* gen4's SIMD8 sampler always has the slots for u,v,r present.
1444        * the unused slots must be zeroed.
1445        */
1446       for (int i = coord_components; i < 3; i++) {
1447          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
1448       }
1449       mlen += 3;
1450
1451       if (op == ir_tex) {
1452          /* There's no plain shadow compare message, so we use shadow
1453           * compare with a bias of 0.0.
1454           */
1455          emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
1456          mlen++;
1457       } else if (op == ir_txb || op == ir_txl) {
1458          emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
1459          mlen++;
1460       } else {
1461          unreachable("Should not get here.");
1462       }
1463
1464       emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
1465       mlen++;
1466    } else if (op == ir_tex) {
1467       for (int i = 0; i < coord_components; i++) {
1468          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1469          coordinate = offset(coordinate, 1);
1470       }
1471       /* zero the others. */
1472       for (int i = coord_components; i<3; i++) {
1473          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
1474       }
1475       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1476       mlen += 3;
1477    } else if (op == ir_txd) {
1478       fs_reg &dPdx = lod;
1479
1480       for (int i = 0; i < coord_components; i++) {
1481          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1482          coordinate = offset(coordinate, 1);
1483       }
1484       /* the slots for u and v are always present, but r is optional */
1485       mlen += MAX2(coord_components, 2);
1486
1487       /*  P   = u, v, r
1488        * dPdx = dudx, dvdx, drdx
1489        * dPdy = dudy, dvdy, drdy
1490        *
1491        * 1-arg: Does not exist.
1492        *
1493        * 2-arg: dudx   dvdx   dudy   dvdy
1494        *        dPdx.x dPdx.y dPdy.x dPdy.y
1495        *        m4     m5     m6     m7
1496        *
1497        * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
1498        *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
1499        *        m5     m6     m7     m8     m9     m10
1500        */
1501       for (int i = 0; i < grad_components; i++) {
1502          emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
1503          dPdx = offset(dPdx, 1);
1504       }
1505       mlen += MAX2(grad_components, 2);
1506
1507       for (int i = 0; i < grad_components; i++) {
1508          emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
1509          dPdy = offset(dPdy, 1);
1510       }
1511       mlen += MAX2(grad_components, 2);
1512    } else if (op == ir_txs) {
1513       /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
1514       simd16 = true;
1515       emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
1516       mlen += 2;
1517    } else {
1518       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
1519        * instructions.  We'll need to do SIMD16 here.
1520        */
1521       simd16 = true;
1522       assert(op == ir_txb || op == ir_txl || op == ir_txf);
1523
1524       for (int i = 0; i < coord_components; i++) {
1525          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
1526                   coordinate));
1527          coordinate = offset(coordinate, 1);
1528       }
1529
1530       /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
1531        * be necessary for TXF (ld), but seems wise to do for all messages.
1532        */
1533       for (int i = coord_components; i < 3; i++) {
1534          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
1535       }
1536
1537       /* lod/bias appears after u/v/r. */
1538       mlen += 6;
1539
1540       emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
1541       mlen++;
1542
1543       /* The unused upper half. */
1544       mlen++;
1545    }
1546
1547    if (simd16) {
1548       /* Now, since we're doing simd16, the return is 2 interleaved
1549        * vec4s where the odd-indexed ones are junk. We'll need to move
1550        * this weirdness around to the expected layout.
1551        */
1552       orig_dst = dst;
1553       dst = fs_reg(GRF, alloc.allocate(8), orig_dst.type);
1554    }
1555
1556    enum opcode opcode;
1557    switch (op) {
1558    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
1559    case ir_txb: opcode = FS_OPCODE_TXB; break;
1560    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
1561    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
1562    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
1563    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
1564    default:
1565       unreachable("not reached");
1566    }
1567
1568    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
1569    inst->base_mrf = base_mrf;
1570    inst->mlen = mlen;
1571    inst->header_present = true;
1572    inst->regs_written = simd16 ? 8 : 4;
1573
1574    if (simd16) {
1575       for (int i = 0; i < 4; i++) {
1576          emit(MOV(orig_dst, dst));
1577          orig_dst = offset(orig_dst, 1);
1578          dst = offset(dst, 2);
1579       }
1580    }
1581
1582    return inst;
1583 }
1584
1585 fs_inst *
1586 fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
1587                                      fs_reg coordinate, int vector_elements,
1588                                      fs_reg shadow_c, fs_reg lod,
1589                                      uint32_t sampler)
1590 {
1591    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
1592    bool has_lod = op == ir_txl || op == ir_txb || op == ir_txf;
1593
1594    if (has_lod && shadow_c.file != BAD_FILE)
1595       no16("TXB and TXL with shadow comparison unsupported in SIMD16.");
1596
1597    if (op == ir_txd)
1598       no16("textureGrad unsupported in SIMD16.");
1599
1600    /* Copy the coordinates. */
1601    for (int i = 0; i < vector_elements; i++) {
1602       emit(MOV(retype(offset(message, i), coordinate.type), coordinate));
1603       coordinate = offset(coordinate, 1);
1604    }
1605
1606    fs_reg msg_end = offset(message, vector_elements);
1607
1608    /* Messages other than sample and ld require all three components */
1609    if (has_lod || shadow_c.file != BAD_FILE) {
1610       for (int i = vector_elements; i < 3; i++) {
1611          emit(MOV(offset(message, i), fs_reg(0.0f)));
1612       }
1613    }
1614
1615    if (has_lod) {
1616       fs_reg msg_lod = retype(offset(message, 3), op == ir_txf ?
1617                               BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
1618       emit(MOV(msg_lod, lod));
1619       msg_end = offset(msg_lod, 1);
1620    }
1621
1622    if (shadow_c.file != BAD_FILE) {
1623       fs_reg msg_ref = offset(message, 3 + has_lod);
1624       emit(MOV(msg_ref, shadow_c));
1625       msg_end = offset(msg_ref, 1);
1626    }
1627
1628    enum opcode opcode;
1629    switch (op) {
1630    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
1631    case ir_txb: opcode = FS_OPCODE_TXB;     break;
1632    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
1633    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
1634    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
1635    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
1636    default: unreachable("not reached");
1637    }
1638
1639    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
1640    inst->base_mrf = message.reg - 1;
1641    inst->mlen = msg_end.reg - inst->base_mrf;
1642    inst->header_present = true;
1643    inst->regs_written = 8;
1644
1645    return inst;
1646 }
1647
1648 /* gen5's sampler has slots for u, v, r, array index, then optional
1649  * parameters like shadow comparitor or LOD bias.  If optional
1650  * parameters aren't present, those base slots are optional and don't
1651  * need to be included in the message.
1652  *
1653  * We don't fill in the unnecessary slots regardless, which may look
1654  * surprising in the disassembly.
1655  */
1656 fs_inst *
1657 fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
1658                               fs_reg coordinate, int vector_elements,
1659                               fs_reg shadow_c,
1660                               fs_reg lod, fs_reg lod2, int grad_components,
1661                               fs_reg sample_index, uint32_t sampler,
1662                               bool has_offset)
1663 {
1664    int reg_width = dispatch_width / 8;
1665    bool header_present = false;
1666
1667    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
1668    fs_reg msg_coords = message;
1669
1670    if (has_offset) {
1671       /* The offsets set up by the ir_texture visitor are in the
1672        * m1 header, so we can't go headerless.
1673        */
1674       header_present = true;
1675       message.reg--;
1676    }
1677
1678    for (int i = 0; i < vector_elements; i++) {
1679       emit(MOV(retype(offset(msg_coords, i), coordinate.type), coordinate));
1680       coordinate = offset(coordinate, 1);
1681    }
1682    fs_reg msg_end = offset(msg_coords, vector_elements);
1683    fs_reg msg_lod = offset(msg_coords, 4);
1684
1685    if (shadow_c.file != BAD_FILE) {
1686       fs_reg msg_shadow = msg_lod;
1687       emit(MOV(msg_shadow, shadow_c));
1688       msg_lod = offset(msg_shadow, 1);
1689       msg_end = msg_lod;
1690    }
1691
1692    enum opcode opcode;
1693    switch (op) {
1694    case ir_tex:
1695       opcode = SHADER_OPCODE_TEX;
1696       break;
1697    case ir_txb:
1698       emit(MOV(msg_lod, lod));
1699       msg_end = offset(msg_lod, 1);
1700
1701       opcode = FS_OPCODE_TXB;
1702       break;
1703    case ir_txl:
1704       emit(MOV(msg_lod, lod));
1705       msg_end = offset(msg_lod, 1);
1706
1707       opcode = SHADER_OPCODE_TXL;
1708       break;
1709    case ir_txd: {
1710       /**
1711        *  P   =  u,    v,    r
1712        * dPdx = dudx, dvdx, drdx
1713        * dPdy = dudy, dvdy, drdy
1714        *
1715        * Load up these values:
1716        * - dudx   dudy   dvdx   dvdy   drdx   drdy
1717        * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
1718        */
1719       msg_end = msg_lod;
1720       for (int i = 0; i < grad_components; i++) {
1721          emit(MOV(msg_end, lod));
1722          lod = offset(lod, 1);
1723          msg_end = offset(msg_end, 1);
1724
1725          emit(MOV(msg_end, lod2));
1726          lod2 = offset(lod2, 1);
1727          msg_end = offset(msg_end, 1);
1728       }
1729
1730       opcode = SHADER_OPCODE_TXD;
1731       break;
1732    }
1733    case ir_txs:
1734       msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
1735       emit(MOV(msg_lod, lod));
1736       msg_end = offset(msg_lod, 1);
1737
1738       opcode = SHADER_OPCODE_TXS;
1739       break;
1740    case ir_query_levels:
1741       msg_lod = msg_end;
1742       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1743       msg_end = offset(msg_lod, 1);
1744
1745       opcode = SHADER_OPCODE_TXS;
1746       break;
1747    case ir_txf:
1748       msg_lod = offset(msg_coords, 3);
1749       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod));
1750       msg_end = offset(msg_lod, 1);
1751
1752       opcode = SHADER_OPCODE_TXF;
1753       break;
1754    case ir_txf_ms:
1755       msg_lod = offset(msg_coords, 3);
1756       /* lod */
1757       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1758       /* sample index */
1759       emit(MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index));
1760       msg_end = offset(msg_lod, 2);
1761
1762       opcode = SHADER_OPCODE_TXF_CMS;
1763       break;
1764    case ir_lod:
1765       opcode = SHADER_OPCODE_LOD;
1766       break;
1767    case ir_tg4:
1768       opcode = SHADER_OPCODE_TG4;
1769       break;
1770    default:
1771       unreachable("not reached");
1772    }
1773
1774    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
1775    inst->base_mrf = message.reg;
1776    inst->mlen = msg_end.reg - message.reg;
1777    inst->header_present = header_present;
1778    inst->regs_written = 4 * reg_width;
1779
1780    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
1781       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
1782            " disallowed by hardware\n");
1783    }
1784
1785    return inst;
1786 }
1787
1788 static bool
1789 is_high_sampler(struct brw_context *brw, fs_reg sampler)
1790 {
1791    if (brw->gen < 8 && !brw->is_haswell)
1792       return false;
1793
1794    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
1795 }
1796
1797 fs_inst *
1798 fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
1799                               fs_reg coordinate, int coord_components,
1800                               fs_reg shadow_c,
1801                               fs_reg lod, fs_reg lod2, int grad_components,
1802                               fs_reg sample_index, fs_reg mcs, fs_reg sampler,
1803                               fs_reg offset_value)
1804 {
1805    int reg_width = dispatch_width / 8;
1806    bool header_present = false;
1807
1808    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE);
1809    for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) {
1810       sources[i] = vgrf(glsl_type::float_type);
1811    }
1812    int length = 0;
1813
1814    if (op == ir_tg4 || offset_value.file != BAD_FILE ||
1815        is_high_sampler(brw, sampler)) {
1816       /* For general texture offsets (no txf workaround), we need a header to
1817        * put them in.  Note that for SIMD16 we're making space for two actual
1818        * hardware registers here, so the emit will have to fix up for this.
1819        *
1820        * * ir4_tg4 needs to place its channel select in the header,
1821        * for interaction with ARB_texture_swizzle
1822        *
1823        * The sampler index is only 4-bits, so for larger sampler numbers we
1824        * need to offset the Sampler State Pointer in the header.
1825        */
1826       header_present = true;
1827       sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
1828       length++;
1829    }
1830
1831    if (shadow_c.file != BAD_FILE) {
1832       emit(MOV(sources[length], shadow_c));
1833       length++;
1834    }
1835
1836    bool has_nonconstant_offset =
1837       offset_value.file != BAD_FILE && offset_value.file != IMM;
1838    bool coordinate_done = false;
1839
1840    /* The sampler can only meaningfully compute LOD for fragment shader
1841     * messages. For all other stages, we change the opcode to ir_txl and
1842     * hardcode the LOD to 0.
1843     */
1844    if (stage != MESA_SHADER_FRAGMENT && op == ir_tex) {
1845       op = ir_txl;
1846       lod = fs_reg(0.0f);
1847    }
1848
1849    /* Set up the LOD info */
1850    switch (op) {
1851    case ir_tex:
1852    case ir_lod:
1853       break;
1854    case ir_txb:
1855       emit(MOV(sources[length], lod));
1856       length++;
1857       break;
1858    case ir_txl:
1859       emit(MOV(sources[length], lod));
1860       length++;
1861       break;
1862    case ir_txd: {
1863       no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
1864
1865       /* Load dPdx and the coordinate together:
1866        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
1867        */
1868       for (int i = 0; i < coord_components; i++) {
1869          emit(MOV(sources[length], coordinate));
1870          coordinate = offset(coordinate, 1);
1871          length++;
1872
1873          /* For cube map array, the coordinate is (u,v,r,ai) but there are
1874           * only derivatives for (u, v, r).
1875           */
1876          if (i < grad_components) {
1877             emit(MOV(sources[length], lod));
1878             lod = offset(lod, 1);
1879             length++;
1880
1881             emit(MOV(sources[length], lod2));
1882             lod2 = offset(lod2, 1);
1883             length++;
1884          }
1885       }
1886
1887       coordinate_done = true;
1888       break;
1889    }
1890    case ir_txs:
1891       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod));
1892       length++;
1893       break;
1894    case ir_query_levels:
1895       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1896       length++;
1897       break;
1898    case ir_txf:
1899       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
1900        * On Gen9 they are u, v, lod, r
1901        */
1902
1903       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1904       coordinate = offset(coordinate, 1);
1905       length++;
1906
1907       if (brw->gen >= 9) {
1908          if (coord_components >= 2) {
1909             emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1910             coordinate = offset(coordinate, 1);
1911          }
1912          length++;
1913       }
1914
1915       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod));
1916       length++;
1917
1918       for (int i = brw->gen >= 9 ? 2 : 1; i < coord_components; i++) {
1919          emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1920          coordinate = offset(coordinate, 1);
1921          length++;
1922       }
1923
1924       coordinate_done = true;
1925       break;
1926    case ir_txf_ms:
1927       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index));
1928       length++;
1929
1930       /* data from the multisample control surface */
1931       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs));
1932       length++;
1933
1934       /* there is no offsetting for this message; just copy in the integer
1935        * texture coordinates
1936        */
1937       for (int i = 0; i < coord_components; i++) {
1938          emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1939          coordinate = offset(coordinate, 1);
1940          length++;
1941       }
1942
1943       coordinate_done = true;
1944       break;
1945    case ir_tg4:
1946       if (has_nonconstant_offset) {
1947          if (shadow_c.file != BAD_FILE)
1948             no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
1949
1950          /* More crazy intermixing */
1951          for (int i = 0; i < 2; i++) { /* u, v */
1952             emit(MOV(sources[length], coordinate));
1953             coordinate = offset(coordinate, 1);
1954             length++;
1955          }
1956
1957          for (int i = 0; i < 2; i++) { /* offu, offv */
1958             emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value));
1959             offset_value = offset(offset_value, 1);
1960             length++;
1961          }
1962
1963          if (coord_components == 3) { /* r if present */
1964             emit(MOV(sources[length], coordinate));
1965             coordinate = offset(coordinate, 1);
1966             length++;
1967          }
1968
1969          coordinate_done = true;
1970       }
1971       break;
1972    }
1973
1974    /* Set up the coordinate (except for cases where it was done above) */
1975    if (!coordinate_done) {
1976       for (int i = 0; i < coord_components; i++) {
1977          emit(MOV(sources[length], coordinate));
1978          coordinate = offset(coordinate, 1);
1979          length++;
1980       }
1981    }
1982
1983    int mlen;
1984    if (reg_width == 2)
1985       mlen = length * reg_width - header_present;
1986    else
1987       mlen = length * reg_width;
1988
1989    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
1990                                BRW_REGISTER_TYPE_F);
1991    emit(LOAD_PAYLOAD(src_payload, sources, length));
1992
1993    /* Generate the SEND */
1994    enum opcode opcode;
1995    switch (op) {
1996    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
1997    case ir_txb: opcode = FS_OPCODE_TXB; break;
1998    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
1999    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2000    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2001    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2002    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2003    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2004    case ir_lod: opcode = SHADER_OPCODE_LOD; break;
2005    case ir_tg4:
2006       if (has_nonconstant_offset)
2007          opcode = SHADER_OPCODE_TG4_OFFSET;
2008       else
2009          opcode = SHADER_OPCODE_TG4;
2010       break;
2011    default:
2012       unreachable("not reached");
2013    }
2014    fs_inst *inst = emit(opcode, dst, src_payload, sampler);
2015    inst->base_mrf = -1;
2016    inst->mlen = mlen;
2017    inst->header_present = header_present;
2018    inst->regs_written = 4 * reg_width;
2019
2020    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
2021       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
2022            " disallowed by hardware\n");
2023    }
2024
2025    return inst;
2026 }
2027
2028 fs_reg
2029 fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
2030                              bool is_rect, uint32_t sampler, int texunit)
2031 {
2032    fs_inst *inst = NULL;
2033    bool needs_gl_clamp = true;
2034    fs_reg scale_x, scale_y;
2035
2036    /* The 965 requires the EU to do the normalization of GL rectangle
2037     * texture coordinates.  We use the program parameter state
2038     * tracking to get the scaling factor.
2039     */
2040    if (is_rect &&
2041        (brw->gen < 6 ||
2042         (brw->gen >= 6 && (key_tex->gl_clamp_mask[0] & (1 << sampler) ||
2043                            key_tex->gl_clamp_mask[1] & (1 << sampler))))) {
2044       struct gl_program_parameter_list *params = prog->Parameters;
2045       int tokens[STATE_LENGTH] = {
2046          STATE_INTERNAL,
2047          STATE_TEXRECT_SCALE,
2048          texunit,
2049          0,
2050          0
2051       };
2052
2053       no16("rectangle scale uniform setup not supported on SIMD16\n");
2054       if (dispatch_width == 16) {
2055          return coordinate;
2056       }
2057
2058       GLuint index = _mesa_add_state_reference(params,
2059                                                (gl_state_index *)tokens);
2060       /* Try to find existing copies of the texrect scale uniforms. */
2061       for (unsigned i = 0; i < uniforms; i++) {
2062          if (stage_prog_data->param[i] ==
2063              &prog->Parameters->ParameterValues[index][0]) {
2064             scale_x = fs_reg(UNIFORM, i);
2065             scale_y = fs_reg(UNIFORM, i + 1);
2066             break;
2067          }
2068       }
2069
2070       /* If we didn't already set them up, do so now. */
2071       if (scale_x.file == BAD_FILE) {
2072          scale_x = fs_reg(UNIFORM, uniforms);
2073          scale_y = fs_reg(UNIFORM, uniforms + 1);
2074
2075          stage_prog_data->param[uniforms++] =
2076             &prog->Parameters->ParameterValues[index][0];
2077          stage_prog_data->param[uniforms++] =
2078             &prog->Parameters->ParameterValues[index][1];
2079       }
2080    }
2081
2082    /* The 965 requires the EU to do the normalization of GL rectangle
2083     * texture coordinates.  We use the program parameter state
2084     * tracking to get the scaling factor.
2085     */
2086    if (brw->gen < 6 && is_rect) {
2087       fs_reg dst = fs_reg(GRF, alloc.allocate(coord_components));
2088       fs_reg src = coordinate;
2089       coordinate = dst;
2090
2091       emit(MUL(dst, src, scale_x));
2092       dst = offset(dst, 1);
2093       src = offset(src, 1);
2094       emit(MUL(dst, src, scale_y));
2095    } else if (is_rect) {
2096       /* On gen6+, the sampler handles the rectangle coordinates
2097        * natively, without needing rescaling.  But that means we have
2098        * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
2099        * not [0, 1] like the default case below.
2100        */
2101       needs_gl_clamp = false;
2102
2103       for (int i = 0; i < 2; i++) {
2104          if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
2105             fs_reg chan = coordinate;
2106             chan = offset(chan, i);
2107
2108             inst = emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f));
2109             inst->conditional_mod = BRW_CONDITIONAL_GE;
2110
2111             /* Our parameter comes in as 1.0/width or 1.0/height,
2112              * because that's what people normally want for doing
2113              * texture rectangle handling.  We need width or height
2114              * for clamping, but we don't care enough to make a new
2115              * parameter type, so just invert back.
2116              */
2117             fs_reg limit = vgrf(glsl_type::float_type);
2118             emit(MOV(limit, i == 0 ? scale_x : scale_y));
2119             emit(SHADER_OPCODE_RCP, limit, limit);
2120
2121             inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
2122             inst->conditional_mod = BRW_CONDITIONAL_L;
2123          }
2124       }
2125    }
2126
2127    if (coord_components > 0 && needs_gl_clamp) {
2128       for (int i = 0; i < MIN2(coord_components, 3); i++) {
2129          if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
2130             fs_reg chan = coordinate;
2131             chan = offset(chan, i);
2132
2133             fs_inst *inst = emit(MOV(chan, chan));
2134             inst->saturate = true;
2135          }
2136       }
2137    }
2138    return coordinate;
2139 }
2140
2141 /* Sample from the MCS surface attached to this multisample texture. */
2142 fs_reg
2143 fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
2144 {
2145    int reg_width = dispatch_width / 8;
2146    fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
2147                            BRW_REGISTER_TYPE_F);
2148    fs_reg dest = vgrf(glsl_type::uvec4_type);
2149    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
2150
2151    /* parameters are: u, v, r; missing parameters are treated as zero */
2152    for (int i = 0; i < components; i++) {
2153       sources[i] = vgrf(glsl_type::float_type);
2154       emit(MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate));
2155       coordinate = offset(coordinate, 1);
2156    }
2157
2158    emit(LOAD_PAYLOAD(payload, sources, components));
2159
2160    fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
2161    inst->base_mrf = -1;
2162    inst->mlen = components * reg_width;
2163    inst->header_present = false;
2164    inst->regs_written = 4 * reg_width; /* we only care about one reg of
2165                                         * response, but the sampler always
2166                                         * writes 4/8
2167                                         */
2168
2169    return dest;
2170 }
2171
2172 void
2173 fs_visitor::emit_texture(ir_texture_opcode op,
2174                          const glsl_type *dest_type,
2175                          fs_reg coordinate, int coord_components,
2176                          fs_reg shadow_c,
2177                          fs_reg lod, fs_reg lod2, int grad_components,
2178                          fs_reg sample_index,
2179                          fs_reg offset_value,
2180                          fs_reg mcs,
2181                          int gather_component,
2182                          bool is_cube_array,
2183                          bool is_rect,
2184                          uint32_t sampler,
2185                          fs_reg sampler_reg, int texunit)
2186 {
2187    fs_inst *inst = NULL;
2188
2189    if (op == ir_tg4) {
2190       /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2191        * emitting anything other than setting up the constant result.
2192        */
2193       int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
2194       if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2195
2196          fs_reg res = vgrf(glsl_type::vec4_type);
2197          this->result = res;
2198
2199          for (int i=0; i<4; i++) {
2200             emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
2201             res = offset(res, 1);
2202          }
2203          return;
2204       }
2205    }
2206
2207    if (coordinate.file != BAD_FILE) {
2208       /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
2209        * samplers.  This should only be a problem with GL_CLAMP on Gen7.
2210        */
2211       coordinate = rescale_texcoord(coordinate, coord_components, is_rect,
2212                                     sampler, texunit);
2213    }
2214
2215    /* Writemasking doesn't eliminate channels on SIMD8 texture
2216     * samples, so don't worry about them.
2217     */
2218    fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
2219
2220    if (brw->gen >= 7) {
2221       inst = emit_texture_gen7(op, dst, coordinate, coord_components,
2222                                shadow_c, lod, lod2, grad_components,
2223                                sample_index, mcs, sampler_reg,
2224                                offset_value);
2225    } else if (brw->gen >= 5) {
2226       inst = emit_texture_gen5(op, dst, coordinate, coord_components,
2227                                shadow_c, lod, lod2, grad_components,
2228                                sample_index, sampler,
2229                                offset_value.file != BAD_FILE);
2230    } else if (dispatch_width == 16) {
2231       inst = emit_texture_gen4_simd16(op, dst, coordinate, coord_components,
2232                                       shadow_c, lod, sampler);
2233    } else {
2234       inst = emit_texture_gen4(op, dst, coordinate, coord_components,
2235                                shadow_c, lod, lod2, grad_components,
2236                                sampler);
2237    }
2238
2239    if (shadow_c.file != BAD_FILE)
2240       inst->shadow_compare = true;
2241
2242    if (offset_value.file == IMM)
2243       inst->offset = offset_value.fixed_hw_reg.dw1.ud;
2244
2245    if (op == ir_tg4) {
2246       inst->offset |=
2247          gather_channel(gather_component, sampler) << 16; /* M0.2:16-17 */
2248
2249       if (brw->gen == 6)
2250          emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], dst);
2251    }
2252
2253    /* fixup #layers for cube map arrays */
2254    if (op == ir_txs && is_cube_array) {
2255       fs_reg depth = offset(dst, 2);
2256       fs_reg fixed_depth = vgrf(glsl_type::int_type);
2257       emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
2258
2259       fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
2260       int components = inst->regs_written / (dst.width / 8);
2261       for (int i = 0; i < components; i++) {
2262          if (i == 2) {
2263             fixed_payload[i] = fixed_depth;
2264          } else {
2265             fixed_payload[i] = offset(dst, i);
2266          }
2267       }
2268       emit(LOAD_PAYLOAD(dst, fixed_payload, components));
2269    }
2270
2271    swizzle_result(op, dest_type->vector_elements, dst, sampler);
2272 }
2273
2274 void
2275 fs_visitor::visit(ir_texture *ir)
2276 {
2277    uint32_t sampler =
2278       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2279
2280    ir_rvalue *nonconst_sampler_index =
2281       _mesa_get_sampler_array_nonconst_index(ir->sampler);
2282
2283    /* Handle non-constant sampler array indexing */
2284    fs_reg sampler_reg;
2285    if (nonconst_sampler_index) {
2286       /* The highest sampler which may be used by this operation is
2287        * the last element of the array. Mark it here, because the generator
2288        * doesn't have enough information to determine the bound.
2289        */
2290       uint32_t array_size = ir->sampler->as_dereference_array()
2291          ->array->type->array_size();
2292
2293       uint32_t max_used = sampler + array_size - 1;
2294       if (ir->op == ir_tg4 && brw->gen < 8) {
2295          max_used += stage_prog_data->binding_table.gather_texture_start;
2296       } else {
2297          max_used += stage_prog_data->binding_table.texture_start;
2298       }
2299
2300       brw_mark_surface_used(prog_data, max_used);
2301
2302       /* Emit code to evaluate the actual indexing expression */
2303       nonconst_sampler_index->accept(this);
2304       fs_reg temp = vgrf(glsl_type::uint_type);
2305       emit(ADD(temp, this->result, fs_reg(sampler)))
2306             ->force_writemask_all = true;
2307       sampler_reg = temp;
2308    } else {
2309       /* Single sampler, or constant array index; the indexing expression
2310        * is just an immediate.
2311        */
2312       sampler_reg = fs_reg(sampler);
2313    }
2314
2315    /* FINISHME: We're failing to recompile our programs when the sampler is
2316     * updated.  This only matters for the texture rectangle scale parameters
2317     * (pre-gen6, or gen6+ with GL_CLAMP).
2318     */
2319    int texunit = prog->SamplerUnits[sampler];
2320
2321    /* Should be lowered by do_lower_texture_projection */
2322    assert(!ir->projector);
2323
2324    /* Should be lowered */
2325    assert(!ir->offset || !ir->offset->type->is_array());
2326
2327    /* Generate code to compute all the subexpression trees.  This has to be
2328     * done before loading any values into MRFs for the sampler message since
2329     * generating these values may involve SEND messages that need the MRFs.
2330     */
2331    fs_reg coordinate;
2332    int coord_components = 0;
2333    if (ir->coordinate) {
2334       coord_components = ir->coordinate->type->vector_elements;
2335       ir->coordinate->accept(this);
2336       coordinate = this->result;
2337    }
2338
2339    fs_reg shadow_comparitor;
2340    if (ir->shadow_comparitor) {
2341       ir->shadow_comparitor->accept(this);
2342       shadow_comparitor = this->result;
2343    }
2344
2345    fs_reg offset_value;
2346    if (ir->offset) {
2347       ir_constant *const_offset = ir->offset->as_constant();
2348       if (const_offset) {
2349          /* Store the header bitfield in an IMM register.  This allows us to
2350           * use offset_value.file to distinguish between no offset, a constant
2351           * offset, and a non-constant offset.
2352           */
2353          offset_value =
2354             fs_reg(brw_texture_offset(const_offset->value.i,
2355                                       const_offset->type->vector_elements));
2356       } else {
2357          ir->offset->accept(this);
2358          offset_value = this->result;
2359       }
2360    }
2361
2362    fs_reg lod, lod2, sample_index, mcs;
2363    int grad_components = 0;
2364    switch (ir->op) {
2365    case ir_tex:
2366    case ir_lod:
2367    case ir_tg4:
2368    case ir_query_levels:
2369       break;
2370    case ir_txb:
2371       ir->lod_info.bias->accept(this);
2372       lod = this->result;
2373       break;
2374    case ir_txd:
2375       ir->lod_info.grad.dPdx->accept(this);
2376       lod = this->result;
2377
2378       ir->lod_info.grad.dPdy->accept(this);
2379       lod2 = this->result;
2380
2381       grad_components = ir->lod_info.grad.dPdx->type->vector_elements;
2382       break;
2383    case ir_txf:
2384    case ir_txl:
2385    case ir_txs:
2386       ir->lod_info.lod->accept(this);
2387       lod = this->result;
2388       break;
2389    case ir_txf_ms:
2390       ir->lod_info.sample_index->accept(this);
2391       sample_index = this->result;
2392
2393       if (brw->gen >= 7 &&
2394           key_tex->compressed_multisample_layout_mask & (1 << sampler)) {
2395          mcs = emit_mcs_fetch(coordinate, ir->coordinate->type->vector_elements,
2396                               sampler_reg);
2397       } else {
2398          mcs = fs_reg(0u);
2399       }
2400       break;
2401    default:
2402       unreachable("Unrecognized texture opcode");
2403    };
2404
2405    int gather_component = 0;
2406    if (ir->op == ir_tg4)
2407       gather_component = ir->lod_info.component->as_constant()->value.i[0];
2408
2409    bool is_rect =
2410       ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT;
2411
2412    bool is_cube_array =
2413       ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2414       ir->sampler->type->sampler_array;
2415
2416    emit_texture(ir->op, ir->type, coordinate, coord_components,
2417                 shadow_comparitor, lod, lod2, grad_components,
2418                 sample_index, offset_value, mcs,
2419                 gather_component, is_cube_array, is_rect, sampler,
2420                 sampler_reg, texunit);
2421 }
2422
2423 /**
2424  * Apply workarounds for Gen6 gather with UINT/SINT
2425  */
2426 void
2427 fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
2428 {
2429    if (!wa)
2430       return;
2431
2432    int width = (wa & WA_8BIT) ? 8 : 16;
2433
2434    for (int i = 0; i < 4; i++) {
2435       fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
2436       /* Convert from UNORM to UINT */
2437       emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1))));
2438       emit(MOV(dst, dst_f));
2439
2440       if (wa & WA_SIGN) {
2441          /* Reinterpret the UINT value as a signed INT value by
2442           * shifting the sign bit into place, then shifting back
2443           * preserving sign.
2444           */
2445          emit(SHL(dst, dst, fs_reg(32 - width)));
2446          emit(ASR(dst, dst, fs_reg(32 - width)));
2447       }
2448
2449       dst = offset(dst, 1);
2450    }
2451 }
2452
2453 /**
2454  * Set up the gather channel based on the swizzle, for gather4.
2455  */
2456 uint32_t
2457 fs_visitor::gather_channel(int orig_chan, uint32_t sampler)
2458 {
2459    int swiz = GET_SWZ(key_tex->swizzles[sampler], orig_chan);
2460    switch (swiz) {
2461       case SWIZZLE_X: return 0;
2462       case SWIZZLE_Y:
2463          /* gather4 sampler is broken for green channel on RG32F --
2464           * we must ask for blue instead.
2465           */
2466          if (key_tex->gather_channel_quirk_mask & (1 << sampler))
2467             return 2;
2468          return 1;
2469       case SWIZZLE_Z: return 2;
2470       case SWIZZLE_W: return 3;
2471       default:
2472          unreachable("Not reached"); /* zero, one swizzles handled already */
2473    }
2474 }
2475
2476 /**
2477  * Swizzle the result of a texture result.  This is necessary for
2478  * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
2479  */
2480 void
2481 fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
2482                            fs_reg orig_val, uint32_t sampler)
2483 {
2484    if (op == ir_query_levels) {
2485       /* # levels is in .w */
2486       this->result = offset(orig_val, 3);
2487       return;
2488    }
2489
2490    this->result = orig_val;
2491
2492    /* txs,lod don't actually sample the texture, so swizzling the result
2493     * makes no sense.
2494     */
2495    if (op == ir_txs || op == ir_lod || op == ir_tg4)
2496       return;
2497
2498    if (dest_components == 1) {
2499       /* Ignore DEPTH_TEXTURE_MODE swizzling. */
2500    } else if (key_tex->swizzles[sampler] != SWIZZLE_NOOP) {
2501       fs_reg swizzled_result = vgrf(glsl_type::vec4_type);
2502       swizzled_result.type = orig_val.type;
2503
2504       for (int i = 0; i < 4; i++) {
2505          int swiz = GET_SWZ(key_tex->swizzles[sampler], i);
2506          fs_reg l = swizzled_result;
2507          l = offset(l, i);
2508
2509          if (swiz == SWIZZLE_ZERO) {
2510             emit(MOV(l, fs_reg(0.0f)));
2511          } else if (swiz == SWIZZLE_ONE) {
2512             emit(MOV(l, fs_reg(1.0f)));
2513          } else {
2514             emit(MOV(l, offset(orig_val,
2515                                GET_SWZ(key_tex->swizzles[sampler], i))));
2516          }
2517       }
2518       this->result = swizzled_result;
2519    }
2520 }
2521
2522 void
2523 fs_visitor::visit(ir_swizzle *ir)
2524 {
2525    ir->val->accept(this);
2526    fs_reg val = this->result;
2527
2528    if (ir->type->vector_elements == 1) {
2529       this->result = offset(this->result, ir->mask.x);
2530       return;
2531    }
2532
2533    fs_reg result = vgrf(ir->type);
2534    this->result = result;
2535
2536    for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
2537       fs_reg channel = val;
2538       int swiz = 0;
2539
2540       switch (i) {
2541       case 0:
2542          swiz = ir->mask.x;
2543          break;
2544       case 1:
2545          swiz = ir->mask.y;
2546          break;
2547       case 2:
2548          swiz = ir->mask.z;
2549          break;
2550       case 3:
2551          swiz = ir->mask.w;
2552          break;
2553       }
2554
2555       emit(MOV(result, offset(channel, swiz)));
2556       result = offset(result, 1);
2557    }
2558 }
2559
2560 void
2561 fs_visitor::visit(ir_discard *ir)
2562 {
2563    /* We track our discarded pixels in f0.1.  By predicating on it, we can
2564     * update just the flag bits that aren't yet discarded.  If there's no
2565     * condition, we emit a CMP of g0 != g0, so all currently executing
2566     * channels will get turned off.
2567     */
2568    fs_inst *cmp;
2569    if (ir->condition) {
2570       emit_bool_to_cond_code(ir->condition);
2571       cmp = (fs_inst *) this->instructions.get_tail();
2572       cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
2573    } else {
2574       fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
2575                                       BRW_REGISTER_TYPE_UW));
2576       cmp = emit(CMP(reg_null_f, some_reg, some_reg, BRW_CONDITIONAL_NZ));
2577    }
2578    cmp->predicate = BRW_PREDICATE_NORMAL;
2579    cmp->flag_subreg = 1;
2580
2581    if (brw->gen >= 6) {
2582       emit_discard_jump();
2583    }
2584 }
2585
2586 void
2587 fs_visitor::visit(ir_constant *ir)
2588 {
2589    /* Set this->result to reg at the bottom of the function because some code
2590     * paths will cause this visitor to be applied to other fields.  This will
2591     * cause the value stored in this->result to be modified.
2592     *
2593     * Make reg constant so that it doesn't get accidentally modified along the
2594     * way.  Yes, I actually had this problem. :(
2595     */
2596    const fs_reg reg = vgrf(ir->type);
2597    fs_reg dst_reg = reg;
2598
2599    if (ir->type->is_array()) {
2600       const unsigned size = type_size(ir->type->fields.array);
2601
2602       for (unsigned i = 0; i < ir->type->length; i++) {
2603          ir->array_elements[i]->accept(this);
2604          fs_reg src_reg = this->result;
2605
2606          dst_reg.type = src_reg.type;
2607          for (unsigned j = 0; j < size; j++) {
2608             emit(MOV(dst_reg, src_reg));
2609             src_reg = offset(src_reg, 1);
2610             dst_reg = offset(dst_reg, 1);
2611          }
2612       }
2613    } else if (ir->type->is_record()) {
2614       foreach_in_list(ir_constant, field, &ir->components) {
2615          const unsigned size = type_size(field->type);
2616
2617          field->accept(this);
2618          fs_reg src_reg = this->result;
2619
2620          dst_reg.type = src_reg.type;
2621          for (unsigned j = 0; j < size; j++) {
2622             emit(MOV(dst_reg, src_reg));
2623             src_reg = offset(src_reg, 1);
2624             dst_reg = offset(dst_reg, 1);
2625          }
2626       }
2627    } else {
2628       const unsigned size = type_size(ir->type);
2629
2630       for (unsigned i = 0; i < size; i++) {
2631          switch (ir->type->base_type) {
2632          case GLSL_TYPE_FLOAT:
2633             emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
2634             break;
2635          case GLSL_TYPE_UINT:
2636             emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
2637             break;
2638          case GLSL_TYPE_INT:
2639             emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
2640             break;
2641          case GLSL_TYPE_BOOL:
2642             emit(MOV(dst_reg, fs_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2643             break;
2644          default:
2645             unreachable("Non-float/uint/int/bool constant");
2646          }
2647          dst_reg = offset(dst_reg, 1);
2648       }
2649    }
2650
2651    this->result = reg;
2652 }
2653
2654 void
2655 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
2656 {
2657    ir_expression *expr = ir->as_expression();
2658
2659    if (!expr || expr->operation == ir_binop_ubo_load) {
2660       ir->accept(this);
2661
2662       fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
2663       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2664       return;
2665    }
2666
2667    fs_reg op[3];
2668
2669    assert(expr->get_num_operands() <= 3);
2670    for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
2671       assert(expr->operands[i]->type->is_scalar());
2672
2673       expr->operands[i]->accept(this);
2674       op[i] = this->result;
2675
2676       resolve_ud_negate(&op[i]);
2677    }
2678
2679    emit_bool_to_cond_code_of_reg(expr, op);
2680 }
2681
2682 void
2683 fs_visitor::emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3])
2684 {
2685    fs_inst *inst;
2686
2687    switch (expr->operation) {
2688    case ir_unop_logic_not:
2689       inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
2690       inst->conditional_mod = BRW_CONDITIONAL_Z;
2691       break;
2692
2693    case ir_binop_logic_xor:
2694       if (brw->gen <= 5) {
2695          fs_reg temp = vgrf(expr->type);
2696          emit(XOR(temp, op[0], op[1]));
2697          inst = emit(AND(reg_null_d, temp, fs_reg(1)));
2698       } else {
2699          inst = emit(XOR(reg_null_d, op[0], op[1]));
2700       }
2701       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2702       break;
2703
2704    case ir_binop_logic_or:
2705       if (brw->gen <= 5) {
2706          fs_reg temp = vgrf(expr->type);
2707          emit(OR(temp, op[0], op[1]));
2708          inst = emit(AND(reg_null_d, temp, fs_reg(1)));
2709       } else {
2710          inst = emit(OR(reg_null_d, op[0], op[1]));
2711       }
2712       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2713       break;
2714
2715    case ir_binop_logic_and:
2716       if (brw->gen <= 5) {
2717          fs_reg temp = vgrf(expr->type);
2718          emit(AND(temp, op[0], op[1]));
2719          inst = emit(AND(reg_null_d, temp, fs_reg(1)));
2720       } else {
2721          inst = emit(AND(reg_null_d, op[0], op[1]));
2722       }
2723       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2724       break;
2725
2726    case ir_unop_f2b:
2727       if (brw->gen >= 6) {
2728          emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
2729       } else {
2730          inst = emit(MOV(reg_null_f, op[0]));
2731          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2732       }
2733       break;
2734
2735    case ir_unop_i2b:
2736       if (brw->gen >= 6) {
2737          emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
2738       } else {
2739          inst = emit(MOV(reg_null_d, op[0]));
2740          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2741       }
2742       break;
2743
2744    case ir_binop_greater:
2745    case ir_binop_gequal:
2746    case ir_binop_less:
2747    case ir_binop_lequal:
2748    case ir_binop_equal:
2749    case ir_binop_all_equal:
2750    case ir_binop_nequal:
2751    case ir_binop_any_nequal:
2752       if (brw->gen <= 5) {
2753          resolve_bool_comparison(expr->operands[0], &op[0]);
2754          resolve_bool_comparison(expr->operands[1], &op[1]);
2755       }
2756
2757       emit(CMP(reg_null_d, op[0], op[1],
2758                brw_conditional_for_comparison(expr->operation)));
2759       break;
2760
2761    case ir_triop_csel: {
2762       /* Expand the boolean condition into the flag register. */
2763       inst = emit(MOV(reg_null_d, op[0]));
2764       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2765
2766       /* Select which boolean to return. */
2767       fs_reg temp = vgrf(expr->operands[1]->type);
2768       inst = emit(SEL(temp, op[1], op[2]));
2769       inst->predicate = BRW_PREDICATE_NORMAL;
2770
2771       /* Expand the result to a condition code. */
2772       inst = emit(MOV(reg_null_d, temp));
2773       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2774       break;
2775    }
2776
2777    default:
2778       unreachable("not reached");
2779    }
2780 }
2781
2782 /**
2783  * Emit a gen6 IF statement with the comparison folded into the IF
2784  * instruction.
2785  */
2786 void
2787 fs_visitor::emit_if_gen6(ir_if *ir)
2788 {
2789    ir_expression *expr = ir->condition->as_expression();
2790
2791    if (expr && expr->operation != ir_binop_ubo_load) {
2792       fs_reg op[3];
2793       fs_inst *inst;
2794       fs_reg temp;
2795
2796       assert(expr->get_num_operands() <= 3);
2797       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
2798          assert(expr->operands[i]->type->is_scalar());
2799
2800          expr->operands[i]->accept(this);
2801          op[i] = this->result;
2802       }
2803
2804       switch (expr->operation) {
2805       case ir_unop_logic_not:
2806          emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_Z));
2807          return;
2808
2809       case ir_binop_logic_xor:
2810          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
2811          return;
2812
2813       case ir_binop_logic_or:
2814          temp = vgrf(glsl_type::bool_type);
2815          emit(OR(temp, op[0], op[1]));
2816          emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
2817          return;
2818
2819       case ir_binop_logic_and:
2820          temp = vgrf(glsl_type::bool_type);
2821          emit(AND(temp, op[0], op[1]));
2822          emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
2823          return;
2824
2825       case ir_unop_f2b:
2826          inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
2827          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2828          return;
2829
2830       case ir_unop_i2b:
2831          emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
2832          return;
2833
2834       case ir_binop_greater:
2835       case ir_binop_gequal:
2836       case ir_binop_less:
2837       case ir_binop_lequal:
2838       case ir_binop_equal:
2839       case ir_binop_all_equal:
2840       case ir_binop_nequal:
2841       case ir_binop_any_nequal:
2842          if (brw->gen <= 5) {
2843             resolve_bool_comparison(expr->operands[0], &op[0]);
2844             resolve_bool_comparison(expr->operands[1], &op[1]);
2845          }
2846
2847          emit(IF(op[0], op[1],
2848                  brw_conditional_for_comparison(expr->operation)));
2849          return;
2850
2851       case ir_triop_csel: {
2852          /* Expand the boolean condition into the flag register. */
2853          fs_inst *inst = emit(MOV(reg_null_d, op[0]));
2854          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2855
2856          /* Select which boolean to use as the result. */
2857          fs_reg temp = vgrf(expr->operands[1]->type);
2858          inst = emit(SEL(temp, op[1], op[2]));
2859          inst->predicate = BRW_PREDICATE_NORMAL;
2860
2861          emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
2862          return;
2863       }
2864
2865       default:
2866          unreachable("not reached");
2867       }
2868    }
2869
2870    ir->condition->accept(this);
2871    emit(IF(this->result, fs_reg(0), BRW_CONDITIONAL_NZ));
2872 }
2873
2874 bool
2875 fs_visitor::try_opt_frontfacing_ternary(ir_if *ir)
2876 {
2877    ir_dereference_variable *deref = ir->condition->as_dereference_variable();
2878    if (!deref || strcmp(deref->var->name, "gl_FrontFacing") != 0)
2879       return false;
2880
2881    if (ir->then_instructions.length() != 1 ||
2882        ir->else_instructions.length() != 1)
2883       return false;
2884
2885    ir_assignment *then_assign =
2886          ((ir_instruction *)ir->then_instructions.head)->as_assignment();
2887    ir_assignment *else_assign =
2888          ((ir_instruction *)ir->else_instructions.head)->as_assignment();
2889
2890    if (!then_assign || then_assign->condition ||
2891        !else_assign || else_assign->condition ||
2892        then_assign->write_mask != else_assign->write_mask ||
2893        !then_assign->lhs->equals(else_assign->lhs))
2894       return false;
2895
2896    ir_constant *then_rhs = then_assign->rhs->as_constant();
2897    ir_constant *else_rhs = else_assign->rhs->as_constant();
2898
2899    if (!then_rhs || !else_rhs)
2900       return false;
2901
2902    if (then_rhs->type->base_type != GLSL_TYPE_FLOAT)
2903       return false;
2904
2905    if ((then_rhs->is_one() && else_rhs->is_negative_one()) ||
2906        (else_rhs->is_one() && then_rhs->is_negative_one())) {
2907       then_assign->lhs->accept(this);
2908       fs_reg dst = this->result;
2909       dst.type = BRW_REGISTER_TYPE_D;
2910       fs_reg tmp = vgrf(glsl_type::int_type);
2911
2912       if (brw->gen >= 6) {
2913          /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
2914          fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
2915
2916          /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
2917           *
2918           *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
2919           *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
2920           *
2921           * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
2922           */
2923
2924          if (then_rhs->is_negative_one()) {
2925             assert(else_rhs->is_one());
2926             g0.negate = true;
2927          }
2928
2929          tmp.type = BRW_REGISTER_TYPE_W;
2930          tmp.subreg_offset = 2;
2931          tmp.stride = 2;
2932
2933          fs_inst *or_inst = emit(OR(tmp, g0, fs_reg(0x3f80)));
2934          or_inst->src[1].type = BRW_REGISTER_TYPE_UW;
2935
2936          tmp.type = BRW_REGISTER_TYPE_D;
2937          tmp.subreg_offset = 0;
2938          tmp.stride = 1;
2939       } else {
2940          /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
2941          fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
2942
2943          /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
2944           *
2945           *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
2946           *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
2947           *
2948           * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
2949           */
2950
2951          if (then_rhs->is_negative_one()) {
2952             assert(else_rhs->is_one());
2953             g1_6.negate = true;
2954          }
2955
2956          emit(OR(tmp, g1_6, fs_reg(0x3f800000)));
2957       }
2958       emit(AND(dst, tmp, fs_reg(0xbf800000)));
2959       return true;
2960    }
2961
2962    return false;
2963 }
2964
2965 /**
2966  * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
2967  *
2968  * Many GLSL shaders contain the following pattern:
2969  *
2970  *    x = condition ? foo : bar
2971  *
2972  * The compiler emits an ir_if tree for this, since each subexpression might be
2973  * a complex tree that could have side-effects or short-circuit logic.
2974  *
2975  * However, the common case is to simply select one of two constants or
2976  * variable values---which is exactly what SEL is for.  In this case, the
2977  * assembly looks like:
2978  *
2979  *    (+f0) IF
2980  *    MOV dst src0
2981  *    ELSE
2982  *    MOV dst src1
2983  *    ENDIF
2984  *
2985  * which can be easily translated into:
2986  *
2987  *    (+f0) SEL dst src0 src1
2988  *
2989  * If src0 is an immediate value, we promote it to a temporary GRF.
2990  */
2991 bool
2992 fs_visitor::try_replace_with_sel()
2993 {
2994    fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
2995    assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
2996
2997    /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
2998    int opcodes[] = {
2999       BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
3000    };
3001
3002    fs_inst *match = (fs_inst *) endif_inst->prev;
3003    for (int i = 0; i < 4; i++) {
3004       if (match->is_head_sentinel() || match->opcode != opcodes[4-i-1])
3005          return false;
3006       match = (fs_inst *) match->prev;
3007    }
3008
3009    /* The opcodes match; it looks like the right sequence of instructions. */
3010    fs_inst *else_mov = (fs_inst *) endif_inst->prev;
3011    fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
3012    fs_inst *if_inst = (fs_inst *) then_mov->prev;
3013
3014    /* Check that the MOVs are the right form. */
3015    if (then_mov->dst.equals(else_mov->dst) &&
3016        !then_mov->is_partial_write() &&
3017        !else_mov->is_partial_write()) {
3018
3019       /* Remove the matched instructions; we'll emit a SEL to replace them. */
3020       while (!if_inst->next->is_tail_sentinel())
3021          if_inst->next->exec_node::remove();
3022       if_inst->exec_node::remove();
3023
3024       /* Only the last source register can be a constant, so if the MOV in
3025        * the "then" clause uses a constant, we need to put it in a temporary.
3026        */
3027       fs_reg src0(then_mov->src[0]);
3028       if (src0.file == IMM) {
3029          src0 = vgrf(glsl_type::float_type);
3030          src0.type = then_mov->src[0].type;
3031          emit(MOV(src0, then_mov->src[0]));
3032       }
3033
3034       fs_inst *sel;
3035       if (if_inst->conditional_mod) {
3036          /* Sandybridge-specific IF with embedded comparison */
3037          emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
3038                   if_inst->conditional_mod));
3039          sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
3040          sel->predicate = BRW_PREDICATE_NORMAL;
3041       } else {
3042          /* Separate CMP and IF instructions */
3043          sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
3044          sel->predicate = if_inst->predicate;
3045          sel->predicate_inverse = if_inst->predicate_inverse;
3046       }
3047
3048       return true;
3049    }
3050
3051    return false;
3052 }
3053
3054 void
3055 fs_visitor::visit(ir_if *ir)
3056 {
3057    if (try_opt_frontfacing_ternary(ir))
3058       return;
3059
3060    /* Don't point the annotation at the if statement, because then it plus
3061     * the then and else blocks get printed.
3062     */
3063    this->base_ir = ir->condition;
3064
3065    if (brw->gen == 6) {
3066       emit_if_gen6(ir);
3067    } else {
3068       emit_bool_to_cond_code(ir->condition);
3069
3070       emit(IF(BRW_PREDICATE_NORMAL));
3071    }
3072
3073    foreach_in_list(ir_instruction, ir_, &ir->then_instructions) {
3074       this->base_ir = ir_;
3075       ir_->accept(this);
3076    }
3077
3078    if (!ir->else_instructions.is_empty()) {
3079       emit(BRW_OPCODE_ELSE);
3080
3081       foreach_in_list(ir_instruction, ir_, &ir->else_instructions) {
3082          this->base_ir = ir_;
3083          ir_->accept(this);
3084       }
3085    }
3086
3087    emit(BRW_OPCODE_ENDIF);
3088
3089    if (!try_replace_with_sel() && brw->gen < 6) {
3090       no16("Can't support (non-uniform) control flow on SIMD16\n");
3091    }
3092 }
3093
3094 void
3095 fs_visitor::visit(ir_loop *ir)
3096 {
3097    if (brw->gen < 6) {
3098       no16("Can't support (non-uniform) control flow on SIMD16\n");
3099    }
3100
3101    this->base_ir = NULL;
3102    emit(BRW_OPCODE_DO);
3103
3104    foreach_in_list(ir_instruction, ir_, &ir->body_instructions) {
3105       this->base_ir = ir_;
3106       ir_->accept(this);
3107    }
3108
3109    this->base_ir = NULL;
3110    emit(BRW_OPCODE_WHILE);
3111 }
3112
3113 void
3114 fs_visitor::visit(ir_loop_jump *ir)
3115 {
3116    switch (ir->mode) {
3117    case ir_loop_jump::jump_break:
3118       emit(BRW_OPCODE_BREAK);
3119       break;
3120    case ir_loop_jump::jump_continue:
3121       emit(BRW_OPCODE_CONTINUE);
3122       break;
3123    }
3124 }
3125
3126 void
3127 fs_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
3128 {
3129    ir_dereference *deref = static_cast<ir_dereference *>(
3130       ir->actual_parameters.get_head());
3131    ir_variable *location = deref->variable_referenced();
3132    unsigned surf_index = (stage_prog_data->binding_table.abo_start +
3133                           location->data.binding);
3134
3135    /* Calculate the surface offset */
3136    fs_reg offset = vgrf(glsl_type::uint_type);
3137    ir_dereference_array *deref_array = deref->as_dereference_array();
3138
3139    if (deref_array) {
3140       deref_array->array_index->accept(this);
3141
3142       fs_reg tmp = vgrf(glsl_type::uint_type);
3143       emit(MUL(tmp, this->result, fs_reg(ATOMIC_COUNTER_SIZE)));
3144       emit(ADD(offset, tmp, fs_reg(location->data.atomic.offset)));
3145    } else {
3146       offset = fs_reg(location->data.atomic.offset);
3147    }
3148
3149    /* Emit the appropriate machine instruction */
3150    const char *callee = ir->callee->function_name();
3151    ir->return_deref->accept(this);
3152    fs_reg dst = this->result;
3153
3154    if (!strcmp("__intrinsic_atomic_read", callee)) {
3155       emit_untyped_surface_read(surf_index, dst, offset);
3156
3157    } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
3158       emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
3159                           fs_reg(), fs_reg());
3160
3161    } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
3162       emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
3163                           fs_reg(), fs_reg());
3164    }
3165 }
3166
3167 void
3168 fs_visitor::visit(ir_call *ir)
3169 {
3170    const char *callee = ir->callee->function_name();
3171
3172    if (!strcmp("__intrinsic_atomic_read", callee) ||
3173        !strcmp("__intrinsic_atomic_increment", callee) ||
3174        !strcmp("__intrinsic_atomic_predecrement", callee)) {
3175       visit_atomic_counter_intrinsic(ir);
3176    } else {
3177       unreachable("Unsupported intrinsic.");
3178    }
3179 }
3180
3181 void
3182 fs_visitor::visit(ir_return *)
3183 {
3184    unreachable("FINISHME");
3185 }
3186
3187 void
3188 fs_visitor::visit(ir_function *ir)
3189 {
3190    /* Ignore function bodies other than main() -- we shouldn't see calls to
3191     * them since they should all be inlined before we get to ir_to_mesa.
3192     */
3193    if (strcmp(ir->name, "main") == 0) {
3194       const ir_function_signature *sig;
3195       exec_list empty;
3196
3197       sig = ir->matching_signature(NULL, &empty, false);
3198
3199       assert(sig);
3200
3201       foreach_in_list(ir_instruction, ir_, &sig->body) {
3202          this->base_ir = ir_;
3203          ir_->accept(this);
3204       }
3205    }
3206 }
3207
3208 void
3209 fs_visitor::visit(ir_function_signature *)
3210 {
3211    unreachable("not reached");
3212 }
3213
3214 void
3215 fs_visitor::visit(ir_emit_vertex *)
3216 {
3217    unreachable("not reached");
3218 }
3219
3220 void
3221 fs_visitor::visit(ir_end_primitive *)
3222 {
3223    unreachable("not reached");
3224 }
3225
3226 void
3227 fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3228                                 fs_reg dst, fs_reg offset, fs_reg src0,
3229                                 fs_reg src1)
3230 {
3231    int reg_width = dispatch_width / 8;
3232    int length = 0;
3233
3234    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4);
3235
3236    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
3237    /* Initialize the sample mask in the message header. */
3238    emit(MOV(sources[0], fs_reg(0u)))
3239       ->force_writemask_all = true;
3240
3241    if (stage == MESA_SHADER_FRAGMENT) {
3242       if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
3243          emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
3244             ->force_writemask_all = true;
3245       } else {
3246          emit(MOV(component(sources[0], 7),
3247                   retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
3248             ->force_writemask_all = true;
3249       }
3250    } else {
3251       /* The execution mask is part of the side-band information sent together with
3252        * the message payload to the data port. It's implicitly ANDed with the sample
3253        * mask sent in the header to compute the actual set of channels that execute
3254        * the atomic operation.
3255        */
3256       assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
3257       emit(MOV(component(sources[0], 7),
3258                fs_reg(0xffffu)))->force_writemask_all = true;
3259    }
3260    length++;
3261
3262    /* Set the atomic operation offset. */
3263    sources[1] = vgrf(glsl_type::uint_type);
3264    emit(MOV(sources[1], offset));
3265    length++;
3266
3267    /* Set the atomic operation arguments. */
3268    if (src0.file != BAD_FILE) {
3269       sources[length] = vgrf(glsl_type::uint_type);
3270       emit(MOV(sources[length], src0));
3271       length++;
3272    }
3273
3274    if (src1.file != BAD_FILE) {
3275       sources[length] = vgrf(glsl_type::uint_type);
3276       emit(MOV(sources[length], src1));
3277       length++;
3278    }
3279
3280    int mlen = 1 + (length - 1) * reg_width;
3281    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
3282                                BRW_REGISTER_TYPE_UD);
3283    emit(LOAD_PAYLOAD(src_payload, sources, length));
3284
3285    /* Emit the instruction. */
3286    fs_inst *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
3287                         fs_reg(atomic_op), fs_reg(surf_index));
3288    inst->mlen = mlen;
3289 }
3290
3291 void
3292 fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
3293                                       fs_reg offset)
3294 {
3295    int reg_width = dispatch_width / 8;
3296
3297    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
3298
3299    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
3300    /* Initialize the sample mask in the message header. */
3301    emit(MOV(sources[0], fs_reg(0u)))
3302       ->force_writemask_all = true;
3303
3304    if (stage == MESA_SHADER_FRAGMENT) {
3305       if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
3306          emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
3307             ->force_writemask_all = true;
3308       } else {
3309          emit(MOV(component(sources[0], 7),
3310                   retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
3311             ->force_writemask_all = true;
3312       }
3313    } else {
3314       /* The execution mask is part of the side-band information sent together with
3315        * the message payload to the data port. It's implicitly ANDed with the sample
3316        * mask sent in the header to compute the actual set of channels that execute
3317        * the atomic operation.
3318        */
3319       assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
3320       emit(MOV(component(sources[0], 7),
3321                fs_reg(0xffffu)))->force_writemask_all = true;
3322    }
3323
3324    /* Set the surface read offset. */
3325    sources[1] = vgrf(glsl_type::uint_type);
3326    emit(MOV(sources[1], offset));
3327
3328    int mlen = 1 + reg_width;
3329    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
3330                                BRW_REGISTER_TYPE_UD);
3331    fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2));
3332
3333    /* Emit the instruction. */
3334    inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
3335                fs_reg(surf_index));
3336    inst->mlen = mlen;
3337 }
3338
3339 fs_inst *
3340 fs_visitor::emit(fs_inst *inst)
3341 {
3342    if (dispatch_width == 16 && inst->exec_size == 8)
3343       inst->force_uncompressed = true;
3344
3345    inst->annotation = this->current_annotation;
3346    inst->ir = this->base_ir;
3347
3348    this->instructions.push_tail(inst);
3349
3350    return inst;
3351 }
3352
/**
 * Emit every instruction contained in \p list, in list order.
 *
 * Each instruction is unlinked from \p list and then appended through
 * emit(fs_inst *).
 *
 * NOTE(review): \p list is taken by value; confirm that iterating a copied
 * exec_list is valid for how callers build these lists.
 */
void
fs_visitor::emit(exec_list list)
{
   /* The _safe iteration variant is used because the current node is
    * removed from the list inside the loop body.
    */
   foreach_in_list_safe(fs_inst, inst, &list) {
      inst->exec_node::remove();
      emit(inst);
   }
}
3361
3362 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
3363 void
3364 fs_visitor::emit_dummy_fs()
3365 {
3366    int reg_width = dispatch_width / 8;
3367
3368    /* Everyone's favorite color. */
3369    const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
3370    for (int i = 0; i < 4; i++) {
3371       emit(MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
3372                       dispatch_width), fs_reg(color[i])));
3373    }
3374
3375    fs_inst *write;
3376    write = emit(FS_OPCODE_FB_WRITE);
3377    write->eot = true;
3378    if (brw->gen >= 6) {
3379       write->base_mrf = 2;
3380       write->mlen = 4 * reg_width;
3381    } else {
3382       write->header_present = true;
3383       write->base_mrf = 0;
3384       write->mlen = 2 + 4 * reg_width;
3385    }
3386
3387    /* Tell the SF we don't have any inputs.  Gen4-5 require at least one
3388     * varying to avoid GPU hangs, so set that.
3389     */
3390    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3391    wm_prog_data->num_varying_inputs = brw->gen < 6 ? 1 : 0;
3392    memset(wm_prog_data->urb_setup, -1,
3393           sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
3394
3395    /* We don't have any uniforms. */
3396    stage_prog_data->nr_params = 0;
3397    stage_prog_data->nr_pull_params = 0;
3398    stage_prog_data->curb_read_length = 0;
3399    stage_prog_data->dispatch_grf_start_reg = 2;
3400    wm_prog_data->dispatch_grf_start_reg_16 = 2;
3401    grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
3402
3403    calculate_cfg();
3404 }
3405
3406 /* The register location here is relative to the start of the URB
3407  * data.  It will get adjusted to be a real location before
3408  * generate_code() time.
3409  */
3410 struct brw_reg
3411 fs_visitor::interp_reg(int location, int channel)
3412 {
3413    assert(stage == MESA_SHADER_FRAGMENT);
3414    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3415    int regnr = prog_data->urb_setup[location] * 2 + channel / 2;
3416    int stride = (channel & 1) * 4;
3417
3418    assert(prog_data->urb_setup[location] != -1);
3419
3420    return brw_vec1_grf(regnr, stride);
3421 }
3422
3423 /** Emits the interpolation for the varying inputs. */
3424 void
3425 fs_visitor::emit_interpolation_setup_gen4()
3426 {
3427    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
3428
3429    this->current_annotation = "compute pixel centers";
3430    this->pixel_x = vgrf(glsl_type::uint_type);
3431    this->pixel_y = vgrf(glsl_type::uint_type);
3432    this->pixel_x.type = BRW_REGISTER_TYPE_UW;
3433    this->pixel_y.type = BRW_REGISTER_TYPE_UW;
3434    emit(ADD(this->pixel_x,
3435             fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
3436             fs_reg(brw_imm_v(0x10101010))));
3437    emit(ADD(this->pixel_y,
3438             fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
3439             fs_reg(brw_imm_v(0x11001100))));
3440
3441    this->current_annotation = "compute pixel deltas from v0";
3442
3443    this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
3444       vgrf(glsl_type::vec2_type);
3445    const fs_reg &delta_xy = this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
3446    const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
3447    const fs_reg ystart(negate(brw_vec1_grf(1, 1)));
3448
3449    if (brw->has_pln && dispatch_width == 16) {
3450       emit(ADD(half(offset(delta_xy, 0), 0), half(this->pixel_x, 0), xstart));
3451       emit(ADD(half(offset(delta_xy, 0), 1), half(this->pixel_y, 0), ystart));
3452       emit(ADD(half(offset(delta_xy, 1), 0), half(this->pixel_x, 1), xstart))
3453          ->force_sechalf = true;
3454       emit(ADD(half(offset(delta_xy, 1), 1), half(this->pixel_y, 1), ystart))
3455          ->force_sechalf = true;
3456    } else {
3457       emit(ADD(offset(delta_xy, 0), this->pixel_x, xstart));
3458       emit(ADD(offset(delta_xy, 1), this->pixel_y, ystart));
3459    }
3460
3461    this->current_annotation = "compute pos.w and 1/pos.w";
3462    /* Compute wpos.w.  It's always in our setup, since it's needed to
3463     * interpolate the other attributes.
3464     */
3465    this->wpos_w = vgrf(glsl_type::float_type);
3466    emit(FS_OPCODE_LINTERP, wpos_w, delta_xy, interp_reg(VARYING_SLOT_POS, 3));
3467    /* Compute the pixel 1/W value from wpos.w. */
3468    this->pixel_w = vgrf(glsl_type::float_type);
3469    emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
3470    this->current_annotation = NULL;
3471 }
3472
3473 /** Emits the interpolation for the varying inputs. */
3474 void
3475 fs_visitor::emit_interpolation_setup_gen6()
3476 {
3477    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
3478
3479    this->current_annotation = "compute pixel centers";
3480    if (brw->gen >= 8 || dispatch_width == 8) {
3481       /* The "Register Region Restrictions" page says for BDW (and newer,
3482        * presumably):
3483        *
3484        *     "When destination spans two registers, the source may be one or
3485        *      two registers. The destination elements must be evenly split
3486        *      between the two registers."
3487        *
3488        * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to
3489        * compute our pixel centers.
3490        */
3491       fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
3492                           BRW_REGISTER_TYPE_UW, dispatch_width * 2);
3493       emit(ADD(int_pixel_xy,
3494                fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
3495                fs_reg(brw_imm_v(0x11001010))))
3496          ->force_writemask_all = true;
3497
3498       this->pixel_x = vgrf(glsl_type::float_type);
3499       this->pixel_y = vgrf(glsl_type::float_type);
3500       emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
3501       emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
3502    } else {
3503       /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
3504        *
3505        *     "When destination spans two registers, the source MUST span two
3506        *      registers."
3507        *
3508        * Since the GRF source of the ADD will only read a single register, we
3509        * must do two separate ADDs in SIMD16.
3510        */
3511       fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
3512       fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
3513       int_pixel_x.type = BRW_REGISTER_TYPE_UW;
3514       int_pixel_y.type = BRW_REGISTER_TYPE_UW;
3515       emit(ADD(int_pixel_x,
3516                fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
3517                fs_reg(brw_imm_v(0x10101010))));
3518       emit(ADD(int_pixel_y,
3519                fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
3520                fs_reg(brw_imm_v(0x11001100))));
3521
3522       /* As of gen6, we can no longer mix float and int sources.  We have
3523        * to turn the integer pixel centers into floats for their actual
3524        * use.
3525        */
3526       this->pixel_x = vgrf(glsl_type::float_type);
3527       this->pixel_y = vgrf(glsl_type::float_type);
3528       emit(MOV(this->pixel_x, int_pixel_x));
3529       emit(MOV(this->pixel_y, int_pixel_y));
3530    }
3531
3532    this->current_annotation = "compute pos.w";
3533    this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
3534    this->wpos_w = vgrf(glsl_type::float_type);
3535    emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
3536
3537    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3538       uint8_t reg = payload.barycentric_coord_reg[i];
3539       this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0));
3540    }
3541
3542    this->current_annotation = NULL;
3543 }
3544
3545 int
3546 fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
3547                                 bool use_2nd_half)
3548 {
3549    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3550    fs_inst *inst;
3551
3552    if (color.file == BAD_FILE) {
3553       return 4 * (dispatch_width / 8);
3554    }
3555
3556    uint8_t colors_enabled;
3557    if (components == 0) {
3558       /* We want to write one component to the alpha channel */
3559       colors_enabled = 0x8;
3560    } else {
3561       /* Enable the first components-many channels */
3562       colors_enabled = (1 << components) - 1;
3563    }
3564
3565    if (dispatch_width == 8 || (brw->gen >= 6 && !do_dual_src)) {
3566       /* SIMD8 write looks like:
3567        * m + 0: r0
3568        * m + 1: r1
3569        * m + 2: g0
3570        * m + 3: g1
3571        *
3572        * gen6 SIMD16 DP write looks like:
3573        * m + 0: r0
3574        * m + 1: r1
3575        * m + 2: g0
3576        * m + 3: g1
3577        * m + 4: b0
3578        * m + 5: b1
3579        * m + 6: a0
3580        * m + 7: a1
3581        */
3582       int len = 0;
3583       for (unsigned i = 0; i < 4; ++i) {
3584          if (colors_enabled & (1 << i)) {
3585             dst[len] = fs_reg(GRF, alloc.allocate(color.width / 8),
3586                               color.type, color.width);
3587             inst = emit(MOV(dst[len], offset(color, i)));
3588             inst->saturate = key->clamp_fragment_color;
3589          } else if (color.width == 16) {
3590             /* We need two BAD_FILE slots for a 16-wide color */
3591             len++;
3592          }
3593          len++;
3594       }
3595       return len;
3596    } else if (brw->gen >= 6 && do_dual_src) {
3597       /* SIMD16 dual source blending for gen6+.
3598        *
3599        * From the SNB PRM, volume 4, part 1, page 193:
3600        *
3601        * "The dual source render target messages only have SIMD8 forms due to
3602        *  maximum message length limitations. SIMD16 pixel shaders must send two
3603        *  of these messages to cover all of the pixels. Each message contains
3604        *  two colors (4 channels each) for each pixel in the message payload."
3605        *
3606        * So in SIMD16 dual source blending we will send 2 SIMD8 messages,
3607        * each one will call this function twice (one for each color involved),
3608        * so in each pass we only write 4 registers. Notice that the second
3609        * SIMD8 message needs to read color data from the 2nd half of the color
3610        * registers, so it needs to call this with use_2nd_half = true.
3611        */
3612       for (unsigned i = 0; i < 4; ++i) {
3613          if (colors_enabled & (1 << i)) {
3614             dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
3615             inst = emit(MOV(dst[i], half(offset(color, i),
3616                                          use_2nd_half ? 1 : 0)));
3617             inst->saturate = key->clamp_fragment_color;
3618             if (use_2nd_half)
3619                inst->force_sechalf = true;
3620          }
3621       }
3622       return 4;
3623    } else {
3624       /* pre-gen6 SIMD16 single source DP write looks like:
3625        * m + 0: r0
3626        * m + 1: g0
3627        * m + 2: b0
3628        * m + 3: a0
3629        * m + 4: r1
3630        * m + 5: g1
3631        * m + 6: b1
3632        * m + 7: a1
3633        */
3634       for (unsigned i = 0; i < 4; ++i) {
3635          if (colors_enabled & (1 << i)) {
3636             dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
3637             inst = emit(MOV(dst[i], half(offset(color, i), 0)));
3638             inst->saturate = key->clamp_fragment_color;
3639
3640             dst[i + 4] = fs_reg(GRF, alloc.allocate(1), color.type);
3641             inst = emit(MOV(dst[i + 4], half(offset(color, i), 1)));
3642             inst->saturate = key->clamp_fragment_color;
3643             inst->force_sechalf = true;
3644          }
3645       }
3646       return 8;
3647    }
3648 }
3649
3650 static enum brw_conditional_mod
3651 cond_for_alpha_func(GLenum func)
3652 {
3653    switch(func) {
3654       case GL_GREATER:
3655          return BRW_CONDITIONAL_G;
3656       case GL_GEQUAL:
3657          return BRW_CONDITIONAL_GE;
3658       case GL_LESS:
3659          return BRW_CONDITIONAL_L;
3660       case GL_LEQUAL:
3661          return BRW_CONDITIONAL_LE;
3662       case GL_EQUAL:
3663          return BRW_CONDITIONAL_EQ;
3664       case GL_NOTEQUAL:
3665          return BRW_CONDITIONAL_NEQ;
3666       default:
3667          unreachable("Not reached");
3668    }
3669 }
3670
3671 /**
3672  * Alpha test support for when we compile it into the shader instead
3673  * of using the normal fixed-function alpha test.
3674  */
3675 void
3676 fs_visitor::emit_alpha_test()
3677 {
3678    assert(stage == MESA_SHADER_FRAGMENT);
3679    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3680    this->current_annotation = "Alpha test";
3681
3682    fs_inst *cmp;
3683    if (key->alpha_test_func == GL_ALWAYS)
3684       return;
3685
3686    if (key->alpha_test_func == GL_NEVER) {
3687       /* f0.1 = 0 */
3688       fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3689                                       BRW_REGISTER_TYPE_UW));
3690       cmp = emit(CMP(reg_null_f, some_reg, some_reg,
3691                      BRW_CONDITIONAL_NEQ));
3692    } else {
3693       /* RT0 alpha */
3694       fs_reg color = offset(outputs[0], 3);
3695
3696       /* f0.1 &= func(color, ref) */
3697       cmp = emit(CMP(reg_null_f, color, fs_reg(key->alpha_test_ref),
3698                      cond_for_alpha_func(key->alpha_test_func)));
3699    }
3700    cmp->predicate = BRW_PREDICATE_NORMAL;
3701    cmp->flag_subreg = 1;
3702 }
3703
3704 fs_inst *
3705 fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
3706                                  fs_reg src0_alpha, unsigned components,
3707                                  bool use_2nd_half)
3708 {
3709    assert(stage == MESA_SHADER_FRAGMENT);
3710    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3711    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3712
3713    this->current_annotation = "FB write header";
3714    bool header_present = true;
3715    int reg_size = dispatch_width / 8;
3716
3717    /* We can potentially have a message length of up to 15, so we have to set
3718     * base_mrf to either 0 or 1 in order to fit in m0..m15.
3719     */
3720    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15);
3721    int length = 0;
3722
3723    /* From the Sandy Bridge PRM, volume 4, page 198:
3724     *
3725     *     "Dispatched Pixel Enables. One bit per pixel indicating
3726     *      which pixels were originally enabled when the thread was
3727     *      dispatched. This field is only required for the end-of-
3728     *      thread message and on all dual-source messages."
3729     */
3730    if (brw->gen >= 6 &&
3731        (brw->is_haswell || brw->gen >= 8 || !prog_data->uses_kill) &&
3732        color1.file == BAD_FILE &&
3733        key->nr_color_regions == 1) {
3734       header_present = false;
3735    }
3736
3737    if (header_present)
3738       /* Allocate 2 registers for a header */
3739       length += 2;
3740
3741    if (payload.aa_dest_stencil_reg) {
3742       sources[length] = fs_reg(GRF, alloc.allocate(1));
3743       emit(MOV(sources[length],
3744                fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))));
3745       length++;
3746    }
3747
3748    prog_data->uses_omask =
3749       prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
3750    if (prog_data->uses_omask) {
3751       this->current_annotation = "FB write oMask";
3752       assert(this->sample_mask.file != BAD_FILE);
3753       /* Hand over gl_SampleMask. Only lower 16 bits are relevant.  Since
3754        * it's unsinged single words, one vgrf is always 16-wide.
3755        */
3756       sources[length] = fs_reg(GRF, alloc.allocate(1),
3757                                BRW_REGISTER_TYPE_UW, 16);
3758       emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
3759       length++;
3760    }
3761
3762    if (color0.file == BAD_FILE) {
3763       /* Even if there's no color buffers enabled, we still need to send
3764        * alpha out the pipeline to our null renderbuffer to support
3765        * alpha-testing, alpha-to-coverage, and so on.
3766        */
3767       length += setup_color_payload(sources + length, this->outputs[0], 0,
3768                                     false);
3769    } else if (color1.file == BAD_FILE) {
3770       if (src0_alpha.file != BAD_FILE) {
3771          sources[length] = fs_reg(GRF, alloc.allocate(reg_size),
3772                                   src0_alpha.type, src0_alpha.width);
3773          fs_inst *inst = emit(MOV(sources[length], src0_alpha));
3774          inst->saturate = key->clamp_fragment_color;
3775          length++;
3776       }
3777
3778       length += setup_color_payload(sources + length, color0, components,
3779                                     false);
3780    } else {
3781       length += setup_color_payload(sources + length, color0, components,
3782                                     use_2nd_half);
3783       length += setup_color_payload(sources + length, color1, components,
3784                                     use_2nd_half);
3785    }
3786
3787    if (source_depth_to_render_target) {
3788       if (brw->gen == 6) {
3789          /* For outputting oDepth on gen6, SIMD8 writes have to be
3790           * used.  This would require SIMD8 moves of each half to
3791           * message regs, kind of like pre-gen5 SIMD16 FB writes.
3792           * Just bail on doing so for now.
3793           */
3794          no16("Missing support for simd16 depth writes on gen6\n");
3795       }
3796
3797       sources[length] = vgrf(glsl_type::float_type);
3798       if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3799          /* Hand over gl_FragDepth. */
3800          assert(this->frag_depth.file != BAD_FILE);
3801          emit(MOV(sources[length], this->frag_depth));
3802       } else {
3803          /* Pass through the payload depth. */
3804          emit(MOV(sources[length],
3805                   fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
3806       }
3807       length++;
3808    }
3809
3810    if (payload.dest_depth_reg) {
3811       sources[length] = vgrf(glsl_type::float_type);
3812       emit(MOV(sources[length],
3813                fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0))));
3814       length++;
3815    }
3816
3817    fs_inst *load;
3818    fs_inst *write;
3819    if (brw->gen >= 7) {
3820       /* Send from the GRF */
3821       fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
3822       load = emit(LOAD_PAYLOAD(payload, sources, length));
3823       payload.reg = alloc.allocate(load->regs_written);
3824       payload.width = dispatch_width;
3825       load->dst = payload;
3826       write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
3827       write->base_mrf = -1;
3828    } else {
3829       /* Send from the MRF */
3830       load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
3831                                sources, length));
3832       write = emit(FS_OPCODE_FB_WRITE);
3833       write->exec_size = dispatch_width;
3834       write->base_mrf = 1;
3835    }
3836
3837    write->mlen = load->regs_written;
3838    write->header_present = header_present;
3839    if (prog_data->uses_kill) {
3840       write->predicate = BRW_PREDICATE_NORMAL;
3841       write->flag_subreg = 1;
3842    }
3843    return write;
3844 }
3845
3846 void
3847 fs_visitor::emit_fb_writes()
3848 {
3849    assert(stage == MESA_SHADER_FRAGMENT);
3850    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3851    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3852
3853    fs_inst *inst = NULL;
3854    if (do_dual_src) {
3855       this->current_annotation = ralloc_asprintf(this->mem_ctx,
3856                                                  "FB dual-source write");
3857       inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
3858                                   reg_undef, 4);
3859       inst->target = 0;
3860
3861       /* SIMD16 dual source blending requires to send two SIMD8 dual source
3862        * messages, where each message contains color data for 8 pixels. Color
3863        * data for the first group of pixels is stored in the "lower" half of
3864        * the color registers, so in SIMD16, the previous message did:
3865        * m + 0: r0
3866        * m + 1: g0
3867        * m + 2: b0
3868        * m + 3: a0
3869        *
3870        * Here goes the second message, which packs color data for the
3871        * remaining 8 pixels. Color data for these pixels is stored in the
3872        * "upper" half of the color registers, so we need to do:
3873        * m + 0: r1
3874        * m + 1: g1
3875        * m + 2: b1
3876        * m + 3: a1
3877        */
3878       if (dispatch_width == 16) {
3879          inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
3880                                      reg_undef, 4, true);
3881          inst->target = 0;
3882       }
3883
3884       prog_data->dual_src_blend = true;
3885    } else {
3886       for (int target = 0; target < key->nr_color_regions; target++) {
3887          /* Skip over outputs that weren't written. */
3888          if (this->outputs[target].file == BAD_FILE)
3889             continue;
3890
3891          this->current_annotation = ralloc_asprintf(this->mem_ctx,
3892                                                     "FB write target %d",
3893                                                     target);
3894          fs_reg src0_alpha;
3895          if (brw->gen >= 6 && key->replicate_alpha && target != 0)
3896             src0_alpha = offset(outputs[0], 3);
3897
3898          inst = emit_single_fb_write(this->outputs[target], reg_undef,
3899                                      src0_alpha,
3900                                      this->output_components[target]);
3901          inst->target = target;
3902       }
3903    }
3904
3905    if (inst == NULL) {
3906       /* Even if there's no color buffers enabled, we still need to send
3907        * alpha out the pipeline to our null renderbuffer to support
3908        * alpha-testing, alpha-to-coverage, and so on.
3909        */
3910       inst = emit_single_fb_write(reg_undef, reg_undef, reg_undef, 0);
3911       inst->target = 0;
3912    }
3913
3914    inst->eot = true;
3915    this->current_annotation = NULL;
3916 }
3917
3918 void
3919 fs_visitor::setup_uniform_clipplane_values()
3920 {
3921    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
3922    const struct brw_vue_prog_key *key =
3923       (const struct brw_vue_prog_key *) this->key;
3924
3925    for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
3926       this->userplane[i] = fs_reg(UNIFORM, uniforms);
3927       for (int j = 0; j < 4; ++j) {
3928          stage_prog_data->param[uniforms + j] =
3929             (gl_constant_value *) &clip_planes[i][j];
3930       }
3931       uniforms += 4;
3932    }
3933 }
3934
3935 void fs_visitor::compute_clip_distance()
3936 {
3937    struct brw_vue_prog_data *vue_prog_data =
3938       (struct brw_vue_prog_data *) prog_data;
3939    const struct brw_vue_prog_key *key =
3940       (const struct brw_vue_prog_key *) this->key;
3941
3942    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3943     *
3944     *     "If a linked set of shaders forming the vertex stage contains no
3945     *     static write to gl_ClipVertex or gl_ClipDistance, but the
3946     *     application has requested clipping against user clip planes through
3947     *     the API, then the coordinate written to gl_Position is used for
3948     *     comparison against the user clip planes."
3949     *
3950     * This function is only called if the shader didn't write to
3951     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
3952     * if the user wrote to it; otherwise we use gl_Position.
3953     */
3954
3955    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3956    if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
3957       clip_vertex = VARYING_SLOT_POS;
3958
3959    /* If the clip vertex isn't written, skip this.  Typically this means
3960     * the GS will set up clipping. */
3961    if (outputs[clip_vertex].file == BAD_FILE)
3962       return;
3963
3964    setup_uniform_clipplane_values();
3965
3966    current_annotation = "user clip distances";
3967
3968    this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
3969    this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
3970
3971    for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
3972       fs_reg u = userplane[i];
3973       fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
3974       output.reg_offset = i & 3;
3975
3976       emit(MUL(output, outputs[clip_vertex], u));
3977       for (int j = 1; j < 4; j++) {
3978          u.reg = userplane[i].reg + j;
3979          emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
3980       }
3981    }
3982 }
3983
3984 void
3985 fs_visitor::emit_urb_writes()
3986 {
3987    int slot, urb_offset, length;
3988    struct brw_vs_prog_data *vs_prog_data =
3989       (struct brw_vs_prog_data *) prog_data;
3990    const struct brw_vs_prog_key *key =
3991       (const struct brw_vs_prog_key *) this->key;
3992    const GLbitfield64 psiz_mask =
3993       VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
3994    const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
3995    bool flush;
3996    fs_reg sources[8];
3997
3998    /* Lower legacy ff and ClipVertex clipping to clip distances */
3999    if (key->base.userclip_active && !prog->UsesClipDistanceOut)
4000       compute_clip_distance();
4001
4002    /* If we don't have any valid slots to write, just do a minimal urb write
4003     * send to terminate the shader. */
4004    if (vue_map->slots_valid == 0) {
4005
4006       fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
4007       fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
4008                                                       BRW_REGISTER_TYPE_UD))));
4009       inst->force_writemask_all = true;
4010
4011       inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
4012       inst->eot = true;
4013       inst->mlen = 1;
4014       inst->offset = 1;
4015       return;
4016    }
4017
4018    length = 0;
4019    urb_offset = 0;
4020    flush = false;
4021    for (slot = 0; slot < vue_map->num_slots; slot++) {
4022       fs_reg reg, src, zero;
4023
4024       int varying = vue_map->slot_to_varying[slot];
4025       switch (varying) {
4026       case VARYING_SLOT_PSIZ:
4027
4028          /* The point size varying slot is the vue header and is always in the
4029           * vue map.  But often none of the special varyings that live there
4030           * are written and in that case we can skip writing to the vue
4031           * header, provided the corresponding state properly clamps the
4032           * values further down the pipeline. */
4033          if ((vue_map->slots_valid & psiz_mask) == 0) {
4034             assert(length == 0);
4035             urb_offset++;
4036             break;
4037          }
4038
4039          zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
4040          emit(MOV(zero, fs_reg(0u)));
4041
4042          sources[length++] = zero;
4043          if (vue_map->slots_valid & VARYING_BIT_LAYER)
4044             sources[length++] = this->outputs[VARYING_SLOT_LAYER];
4045          else
4046             sources[length++] = zero;
4047
4048          if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
4049             sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
4050          else
4051             sources[length++] = zero;
4052
4053          if (vue_map->slots_valid & VARYING_BIT_PSIZ)
4054             sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
4055          else
4056             sources[length++] = zero;
4057          break;
4058
4059       case BRW_VARYING_SLOT_NDC:
4060       case VARYING_SLOT_EDGE:
4061          unreachable("unexpected scalar vs output");
4062          break;
4063
4064       case BRW_VARYING_SLOT_PAD:
4065          break;
4066
4067       default:
4068          /* gl_Position is always in the vue map, but isn't always written by
4069           * the shader.  Other varyings (clip distances) get added to the vue
4070           * map but don't always get written.  In those cases, the
4071           * corresponding this->output[] slot will be invalid we and can skip
4072           * the urb write for the varying.  If we've already queued up a vue
4073           * slot for writing we flush a mlen 5 urb write, otherwise we just
4074           * advance the urb_offset.
4075           */
4076          if (this->outputs[varying].file == BAD_FILE) {
4077             if (length > 0)
4078                flush = true;
4079             else
4080                urb_offset++;
4081             break;
4082          }
4083
4084          if ((varying == VARYING_SLOT_COL0 ||
4085               varying == VARYING_SLOT_COL1 ||
4086               varying == VARYING_SLOT_BFC0 ||
4087               varying == VARYING_SLOT_BFC1) &&
4088              key->clamp_vertex_color) {
4089             /* We need to clamp these guys, so do a saturating MOV into a
4090              * temp register and use that for the payload.
4091              */
4092             for (int i = 0; i < 4; i++) {
4093                reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
4094                src = offset(this->outputs[varying], i);
4095                fs_inst *inst = emit(MOV(reg, src));
4096                inst->saturate = true;
4097                sources[length++] = reg;
4098             }
4099          } else {
4100             for (int i = 0; i < 4; i++)
4101                sources[length++] = offset(this->outputs[varying], i);
4102          }
4103          break;
4104       }
4105
4106       current_annotation = "URB write";
4107
4108       /* If we've queued up 8 registers of payload (2 VUE slots), if this is
4109        * the last slot or if we need to flush (see BAD_FILE varying case
4110        * above), emit a URB write send now to flush out the data.
4111        */
4112       int last = slot == vue_map->num_slots - 1;
4113       if (length == 8 || last)
4114          flush = true;
4115       if (flush) {
4116          fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
4117          fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
4118                                  BRW_REGISTER_TYPE_F);
4119
4120          /* We need WE_all on the MOV for the message header (the URB handles)
4121           * so do a MOV to a dummy register and set force_writemask_all on the
4122           * MOV.  LOAD_PAYLOAD will preserve that.
4123           */
4124          fs_reg dummy = fs_reg(GRF, alloc.allocate(1),
4125                                BRW_REGISTER_TYPE_UD);
4126          fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0),
4127                                                        BRW_REGISTER_TYPE_UD))));
4128          inst->force_writemask_all = true;
4129          payload_sources[0] = dummy;
4130
4131          memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
4132          emit(LOAD_PAYLOAD(payload, payload_sources, length + 1));
4133
4134          inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
4135          inst->eot = last;
4136          inst->mlen = length + 1;
4137          inst->offset = urb_offset;
4138          urb_offset = slot + 1;
4139          length = 0;
4140          flush = false;
4141       }
4142    }
4143 }
4144
4145 void
4146 fs_visitor::resolve_ud_negate(fs_reg *reg)
4147 {
4148    if (reg->type != BRW_REGISTER_TYPE_UD ||
4149        !reg->negate)
4150       return;
4151
4152    fs_reg temp = vgrf(glsl_type::uint_type);
4153    emit(MOV(temp, *reg));
4154    *reg = temp;
4155 }
4156
4157 /**
4158  * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
4159  *
4160  * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
4161  * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
4162  */
4163 void
4164 fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
4165 {
4166    assert(brw->gen <= 5);
4167
4168    if (rvalue->type != glsl_type::bool_type)
4169       return;
4170
4171    fs_reg and_result = vgrf(glsl_type::bool_type);
4172    fs_reg neg_result = vgrf(glsl_type::bool_type);
4173    emit(AND(and_result, *reg, fs_reg(1)));
4174    emit(MOV(neg_result, negate(and_result)));
4175    *reg = neg_result;
4176 }
4177
4178 fs_visitor::fs_visitor(struct brw_context *brw,
4179                        void *mem_ctx,
4180                        const struct brw_wm_prog_key *key,
4181                        struct brw_wm_prog_data *prog_data,
4182                        struct gl_shader_program *shader_prog,
4183                        struct gl_fragment_program *fp,
4184                        unsigned dispatch_width)
4185    : backend_visitor(brw, shader_prog, &fp->Base, &prog_data->base,
4186                      MESA_SHADER_FRAGMENT),
4187      reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
4188      reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
4189      reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
4190      key(key), prog_data(&prog_data->base),
4191      dispatch_width(dispatch_width), promoted_constants(0)
4192 {
4193    this->mem_ctx = mem_ctx;
4194    init();
4195 }
4196
4197 fs_visitor::fs_visitor(struct brw_context *brw,
4198                        void *mem_ctx,
4199                        const struct brw_vs_prog_key *key,
4200                        struct brw_vs_prog_data *prog_data,
4201                        struct gl_shader_program *shader_prog,
4202                        struct gl_vertex_program *cp,
4203                        unsigned dispatch_width)
4204    : backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base.base,
4205                      MESA_SHADER_VERTEX),
4206      reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
4207      reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
4208      reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
4209      key(key), prog_data(&prog_data->base.base),
4210      dispatch_width(dispatch_width), promoted_constants(0)
4211 {
4212    this->mem_ctx = mem_ctx;
4213    init();
4214 }
4215
4216 void
4217 fs_visitor::init()
4218 {
4219    switch (stage) {
4220    case MESA_SHADER_FRAGMENT:
4221       key_tex = &((const brw_wm_prog_key *) key)->tex;
4222       break;
4223    case MESA_SHADER_VERTEX:
4224    case MESA_SHADER_GEOMETRY:
4225       key_tex = &((const brw_vue_prog_key *) key)->tex;
4226       break;
4227    default:
4228       unreachable("unhandled shader stage");
4229    }
4230
4231    this->failed = false;
4232    this->simd16_unsupported = false;
4233    this->no16_msg = NULL;
4234    this->variable_ht = hash_table_ctor(0,
4235                                        hash_table_pointer_hash,
4236                                        hash_table_pointer_compare);
4237
4238    this->nir_locals = NULL;
4239    this->nir_globals = NULL;
4240
4241    memset(&this->payload, 0, sizeof(this->payload));
4242    memset(this->outputs, 0, sizeof(this->outputs));
4243    memset(this->output_components, 0, sizeof(this->output_components));
4244    this->source_depth_to_render_target = false;
4245    this->runtime_check_aads_emit = false;
4246    this->first_non_payload_grf = 0;
4247    this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
4248
4249    this->current_annotation = NULL;
4250    this->base_ir = NULL;
4251
4252    this->virtual_grf_start = NULL;
4253    this->virtual_grf_end = NULL;
4254    this->live_intervals = NULL;
4255    this->regs_live_at_ip = NULL;
4256
4257    this->uniforms = 0;
4258    this->last_scratch = 0;
4259    this->pull_constant_loc = NULL;
4260    this->push_constant_loc = NULL;
4261
4262    this->spilled_any_registers = false;
4263    this->do_dual_src = false;
4264
4265    if (dispatch_width == 8)
4266       this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
4267 }
4268
4269 fs_visitor::~fs_visitor()
4270 {
4271    hash_table_dtor(this->variable_ht);
4272 }