src/intel/compiler/brw_fs_nir.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "compiler/glsl/ir.h"
  25 #include "brw_fs.h"
  26 #include "brw_fs_surface_builder.h"
  27 #include "brw_nir.h"
  28 #include "util/u_math.h"
  29 #include "util/bitscan.h"
  30
  31 using namespace brw;
  32 using namespace brw::surface_access;
  33
  34 void
  35 fs_visitor::emit_nir_code()
  36 {
  37    /* emit the arrays used for inputs and outputs - load/store intrinsics will
  38     * be converted to reads/writes of these arrays
  39     */
  40    nir_setup_outputs();
  41    nir_setup_uniforms();
  42    nir_emit_system_values();
  43
  44    nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
  45 }
  46
  47 void
  48 fs_visitor::nir_setup_outputs()
  49 {
  50    if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
  51       return;
  52
  53    unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
  54
  55    /* Calculate the size of output registers in a separate pass, before
  56     * allocating them.  With ARB_enhanced_layouts, multiple output variables
  57     * may occupy the same slot, but have different type sizes.
  58     */
  59    nir_foreach_variable(var, &nir->outputs) {
  60       const int loc = var->data.driver_location;
  61       const unsigned var_vec4s =
  62          var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
  63                            : type_size_vec4(var->type);
  64       vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
  65    }
  66
  67    for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
  68       if (vec4s[loc] == 0) {
  69          loc++;
  70          continue;
  71       }
  72
  73       unsigned reg_size = vec4s[loc];
  74
  75       /* Check if there are any ranges that start within this range and extend
  76        * past it. If so, include them in this allocation.
  77        */
  78       for (unsigned i = 1; i < reg_size; i++)
  79          reg_size = MAX2(vec4s[i + loc] + i, reg_size);
  80
  81       fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
  82       for (unsigned i = 0; i < reg_size; i++)
  83          outputs[loc + i] = offset(reg, bld, 4 * i);
  84
  85       loc += reg_size;
  86    }
  87 }
  88
  89 void
  90 fs_visitor::nir_setup_uniforms()
  91 {
  92    /* Only the first compile gets to set up uniforms. */
  93    if (push_constant_loc) {
  94       assert(pull_constant_loc);
  95       return;
  96    }
  97
  98    uniforms = nir->num_uniforms / 4;
  99
 100    if (stage == MESA_SHADER_COMPUTE) {
 101       /* Add a uniform for the thread local id.  It must be the last uniform
 102        * on the list.
 103        */
 104       assert(uniforms == prog_data->nr_params);
 105       uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
 106       *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
 107       subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
 108    }
 109 }
 110
 111 static bool
 112 emit_system_values_block(nir_block *block, fs_visitor *v)
 113 {
 114    fs_reg *reg;
 115
 116    nir_foreach_instr(instr, block) {
 117       if (instr->type != nir_instr_type_intrinsic)
 118          continue;
 119
 120       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 121       switch (intrin->intrinsic) {
 122       case nir_intrinsic_load_vertex_id:
 123       case nir_intrinsic_load_base_vertex:
 124          unreachable("should be lowered by nir_lower_system_values().");
 125
 126       case nir_intrinsic_load_vertex_id_zero_base:
 127       case nir_intrinsic_load_is_indexed_draw:
 128       case nir_intrinsic_load_first_vertex:
 129       case nir_intrinsic_load_instance_id:
 130       case nir_intrinsic_load_base_instance:
 131       case nir_intrinsic_load_draw_id:
 132          unreachable("should be lowered by brw_nir_lower_vs_inputs().");
 133
 134       case nir_intrinsic_load_invocation_id:
 135          if (v->stage == MESA_SHADER_TESS_CTRL)
 136             break;
 137          assert(v->stage == MESA_SHADER_GEOMETRY);
 138          reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
 139          if (reg->file == BAD_FILE) {
 140             const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
 141             fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 142             fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
 143             abld.SHR(iid, g1, brw_imm_ud(27u));
 144             *reg = iid;
 145          }
 146          break;
 147
 148       case nir_intrinsic_load_sample_pos:
 149          assert(v->stage == MESA_SHADER_FRAGMENT);
 150          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
 151          if (reg->file == BAD_FILE)
 152             *reg = *v->emit_samplepos_setup();
 153          break;
 154
 155       case nir_intrinsic_load_sample_id:
 156          assert(v->stage == MESA_SHADER_FRAGMENT);
 157          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
 158          if (reg->file == BAD_FILE)
 159             *reg = *v->emit_sampleid_setup();
 160          break;
 161
 162       case nir_intrinsic_load_sample_mask_in:
 163          assert(v->stage == MESA_SHADER_FRAGMENT);
 164          assert(v->devinfo->gen >= 7);
 165          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
 166          if (reg->file == BAD_FILE)
 167             *reg = *v->emit_samplemaskin_setup();
 168          break;
 169
 170       case nir_intrinsic_load_work_group_id:
 171          assert(v->stage == MESA_SHADER_COMPUTE);
 172          reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
 173          if (reg->file == BAD_FILE)
 174             *reg = *v->emit_cs_work_group_id_setup();
 175          break;
 176
 177       case nir_intrinsic_load_helper_invocation:
 178          assert(v->stage == MESA_SHADER_FRAGMENT);
 179          reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
 180          if (reg->file == BAD_FILE) {
 181             const fs_builder abld =
 182                v->bld.annotate("gl_HelperInvocation", NULL);
 183
 184             /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
 185              * pixel mask is in g1.7 of the thread payload.
 186              *
 187              * We move the per-channel pixel enable bit to the low bit of each
 188              * channel by shifting the byte containing the pixel mask by the
 189              * vector immediate 0x76543210UV.
 190              *
 191              * The region of <1,8,0> reads only 1 byte (the pixel masks for
 192              * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
 193              * masks for 2 and 3) in SIMD16.
 194              */
 195             fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
 196
 197             for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) {
 198                const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i);
 199                hbld.SHR(offset(shifted, hbld, i),
 200                         stride(retype(brw_vec1_grf(1 + i, 7),
 201                                       BRW_REGISTER_TYPE_UB),
 202                                1, 8, 0),
 203                         brw_imm_v(0x76543210));
 204             }
 205
 206             /* A set bit in the pixel mask means the channel is enabled, but
 207              * that is the opposite of gl_HelperInvocation so we need to invert
 208              * the mask.
 209              *
 210              * The negate source-modifier bit of logical instructions on Gen8+
 211              * performs 1's complement negation, so we can use that instead of
 212              * a NOT instruction.
 213              */
 214             fs_reg inverted = negate(shifted);
 215             if (v->devinfo->gen < 8) {
 216                inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
 217                abld.NOT(inverted, shifted);
 218             }
 219
 220             /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
 221              * with 1 and negating.
 222              */
 223             fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
 224             abld.AND(anded, inverted, brw_imm_uw(1));
 225
 226             fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
 227             abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
 228             *reg = dst;
 229          }
 230          break;
 231
 232       default:
 233          break;
 234       }
 235    }
 236
 237    return true;
 238 }
 239
 240 void
 241 fs_visitor::nir_emit_system_values()
 242 {
 243    nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
 244    for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
 245       nir_system_values[i] = fs_reg();
 246    }
 247
 248    /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
 249     * never end up using it.
 250     */
 251    {
 252       const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
 253       fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
 254       reg = abld.vgrf(BRW_REGISTER_TYPE_UW);
 255
 256       const fs_builder allbld8 = abld.group(8, 0).exec_all();
 257       allbld8.MOV(reg, brw_imm_v(0x76543210));
 258       if (dispatch_width > 8)
 259          allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
 260       if (dispatch_width > 16) {
 261          const fs_builder allbld16 = abld.group(16, 0).exec_all();
 262          allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
 263       }
 264    }
 265
 266    nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir);
 267    nir_foreach_block(block, impl)
 268       emit_system_values_block(block, this);
 269 }
 270
 271 /*
 272  * Returns a type based on a reference_type (word, float, half-float) and a
 273  * given bit_size.
 274  *
 275  * Reference BRW_REGISTER_TYPE are HF,F,DF,W,D,UW,UD.
 276  *
 277  * @FIXME: 64-bit return types are always DF on integer types to maintain
 278  * compability with uses of DF previously to the introduction of int64
 279  * support.
 280  */
 281 static brw_reg_type
 282 brw_reg_type_from_bit_size(const unsigned bit_size,
 283                            const brw_reg_type reference_type)
 284 {
 285    switch(reference_type) {
 286    case BRW_REGISTER_TYPE_HF:
 287    case BRW_REGISTER_TYPE_F:
 288    case BRW_REGISTER_TYPE_DF:
 289       switch(bit_size) {
 290       case 16:
 291          return BRW_REGISTER_TYPE_HF;
 292       case 32:
 293          return BRW_REGISTER_TYPE_F;
 294       case 64:
 295          return BRW_REGISTER_TYPE_DF;
 296       default:
 297          unreachable("Invalid bit size");
 298       }
 299    case BRW_REGISTER_TYPE_B:
 300    case BRW_REGISTER_TYPE_W:
 301    case BRW_REGISTER_TYPE_D:
 302    case BRW_REGISTER_TYPE_Q:
 303       switch(bit_size) {
 304       case 8:
 305          return BRW_REGISTER_TYPE_B;
 306       case 16:
 307          return BRW_REGISTER_TYPE_W;
 308       case 32:
 309          return BRW_REGISTER_TYPE_D;
 310       case 64:
 311          return BRW_REGISTER_TYPE_Q;
 312       default:
 313          unreachable("Invalid bit size");
 314       }
 315    case BRW_REGISTER_TYPE_UB:
 316    case BRW_REGISTER_TYPE_UW:
 317    case BRW_REGISTER_TYPE_UD:
 318    case BRW_REGISTER_TYPE_UQ:
 319       switch(bit_size) {
 320       case 8:
 321          return BRW_REGISTER_TYPE_UB;
 322       case 16:
 323          return BRW_REGISTER_TYPE_UW;
 324       case 32:
 325          return BRW_REGISTER_TYPE_UD;
 326       case 64:
 327          return BRW_REGISTER_TYPE_UQ;
 328       default:
 329          unreachable("Invalid bit size");
 330       }
 331    default:
 332       unreachable("Unknown type");
 333    }
 334 }
 335
 336 void
 337 fs_visitor::nir_emit_impl(nir_function_impl *impl)
 338 {
 339    nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
 340    for (unsigned i = 0; i < impl->reg_alloc; i++) {
 341       nir_locals[i] = fs_reg();
 342    }
 343
 344    foreach_list_typed(nir_register, reg, node, &impl->registers) {
 345       unsigned array_elems =
 346          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
 347       unsigned size = array_elems * reg->num_components;
 348       const brw_reg_type reg_type =
 349          brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
 350       nir_locals[reg->index] = bld.vgrf(reg_type, size);
 351    }
 352
 353    nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
 354                              impl->ssa_alloc);
 355
 356    nir_emit_cf_list(&impl->body);
 357 }
 358
 359 void
 360 fs_visitor::nir_emit_cf_list(exec_list *list)
 361 {
 362    exec_list_validate(list);
 363    foreach_list_typed(nir_cf_node, node, node, list) {
 364       switch (node->type) {
 365       case nir_cf_node_if:
 366          nir_emit_if(nir_cf_node_as_if(node));
 367          break;
 368
 369       case nir_cf_node_loop:
 370          nir_emit_loop(nir_cf_node_as_loop(node));
 371          break;
 372
 373       case nir_cf_node_block:
 374          nir_emit_block(nir_cf_node_as_block(node));
 375          break;
 376
 377       default:
 378          unreachable("Invalid CFG node block");
 379       }
 380    }
 381 }
 382
 383 void
 384 fs_visitor::nir_emit_if(nir_if *if_stmt)
 385 {
 386    /* first, put the condition into f0 */
 387    fs_inst *inst = bld.MOV(bld.null_reg_d(),
 388                             retype(get_nir_src(if_stmt->condition),
 389                                    BRW_REGISTER_TYPE_D));
 390    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 391
 392    bld.IF(BRW_PREDICATE_NORMAL);
 393
 394    nir_emit_cf_list(&if_stmt->then_list);
 395
 396    /* note: if the else is empty, dead CF elimination will remove it */
 397    bld.emit(BRW_OPCODE_ELSE);
 398
 399    nir_emit_cf_list(&if_stmt->else_list);
 400
 401    bld.emit(BRW_OPCODE_ENDIF);
 402
 403    if (devinfo->gen < 7)
 404       limit_dispatch_width(16, "Non-uniform control flow unsupported "
 405                            "in SIMD32 mode.");
 406 }
 407
 408 void
 409 fs_visitor::nir_emit_loop(nir_loop *loop)
 410 {
 411    bld.emit(BRW_OPCODE_DO);
 412
 413    nir_emit_cf_list(&loop->body);
 414
 415    bld.emit(BRW_OPCODE_WHILE);
 416
 417    if (devinfo->gen < 7)
 418       limit_dispatch_width(16, "Non-uniform control flow unsupported "
 419                            "in SIMD32 mode.");
 420 }
 421
 422 void
 423 fs_visitor::nir_emit_block(nir_block *block)
 424 {
 425    nir_foreach_instr(instr, block) {
 426       nir_emit_instr(instr);
 427    }
 428 }
 429
 430 void
 431 fs_visitor::nir_emit_instr(nir_instr *instr)
 432 {
 433    const fs_builder abld = bld.annotate(NULL, instr);
 434
 435    switch (instr->type) {
 436    case nir_instr_type_alu:
 437       nir_emit_alu(abld, nir_instr_as_alu(instr));
 438       break;
 439
 440    case nir_instr_type_deref:
 441       /* Derefs can exist for images but they do nothing */
 442       break;
 443
 444    case nir_instr_type_intrinsic:
 445       switch (stage) {
 446       case MESA_SHADER_VERTEX:
 447          nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 448          break;
 449       case MESA_SHADER_TESS_CTRL:
 450          nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 451          break;
 452       case MESA_SHADER_TESS_EVAL:
 453          nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
 454          break;
 455       case MESA_SHADER_GEOMETRY:
 456          nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 457          break;
 458       case MESA_SHADER_FRAGMENT:
 459          nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 460          break;
 461       case MESA_SHADER_COMPUTE:
 462          nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 463          break;
 464       default:
 465          unreachable("unsupported shader stage");
 466       }
 467       break;
 468
 469    case nir_instr_type_tex:
 470       nir_emit_texture(abld, nir_instr_as_tex(instr));
 471       break;
 472
 473    case nir_instr_type_load_const:
 474       nir_emit_load_const(abld, nir_instr_as_load_const(instr));
 475       break;
 476
 477    case nir_instr_type_ssa_undef:
 478       /* We create a new VGRF for undefs on every use (by handling
 479        * them in get_nir_src()), rather than for each definition.
 480        * This helps register coalescing eliminate MOVs from undef.
 481        */
 482       break;
 483
 484    case nir_instr_type_jump:
 485       nir_emit_jump(abld, nir_instr_as_jump(instr));
 486       break;
 487
 488    default:
 489       unreachable("unknown instruction type");
 490    }
 491 }
 492
 493 /**
 494  * Recognizes a parent instruction of nir_op_extract_* and changes the type to
 495  * match instr.
 496  */
 497 bool
 498 fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
 499                                       const fs_reg &result)
 500 {
 501    if (!instr->src[0].src.is_ssa ||
 502        !instr->src[0].src.ssa->parent_instr)
 503       return false;
 504
 505    if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
 506       return false;
 507
 508    nir_alu_instr *src0 =
 509       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
 510
 511    if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
 512        src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
 513       return false;
 514
 515    unsigned element = nir_src_as_uint(src0->src[1].src);
 516
 517    /* Element type to extract.*/
 518    const brw_reg_type type = brw_int_type(
 519       src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
 520       src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
 521
 522    fs_reg op0 = get_nir_src(src0->src[0].src);
 523    op0.type = brw_type_for_nir_type(devinfo,
 524       (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
 525                      nir_src_bit_size(src0->src[0].src)));
 526    op0 = offset(op0, bld, src0->src[0].swizzle[0]);
 527
 528    set_saturate(instr->dest.saturate,
 529                 bld.MOV(result, subscript(op0, type, element)));
 530    return true;
 531 }
 532
 533 bool
 534 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
 535                                          const fs_reg &result)
 536 {
 537    if (!instr->src[0].src.is_ssa ||
 538        instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
 539       return false;
 540
 541    nir_intrinsic_instr *src0 =
 542       nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
 543
 544    if (src0->intrinsic != nir_intrinsic_load_front_face)
 545       return false;
 546
 547    if (!nir_src_is_const(instr->src[1].src) ||
 548        !nir_src_is_const(instr->src[2].src))
 549       return false;
 550
 551    const float value1 = nir_src_as_float(instr->src[1].src);
 552    const float value2 = nir_src_as_float(instr->src[2].src);
 553    if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
 554       return false;
 555
 556    /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
 557    assert(value1 == -value2);
 558
 559    fs_reg tmp = vgrf(glsl_type::int_type);
 560
 561    if (devinfo->gen >= 6) {
 562       /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
 563       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
 564
 565       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 566        *
 567        *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
 568        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
 569        *
 570        * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
 571        *
 572        * This negation looks like it's safe in practice, because bits 0:4 will
 573        * surely be TRIANGLES
 574        */
 575
 576       if (value1 == -1.0f) {
 577          g0.negate = true;
 578       }
 579
 580       bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
 581              g0, brw_imm_uw(0x3f80));
 582    } else {
 583       /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
 584       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
 585
 586       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 587        *
 588        *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
 589        *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
 590        *
 591        * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
 592        *
 593        * This negation looks like it's safe in practice, because bits 0:4 will
 594        * surely be TRIANGLES
 595        */
 596
 597       if (value1 == -1.0f) {
 598          g1_6.negate = true;
 599       }
 600
 601       bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
 602    }
 603    bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
 604
 605    return true;
 606 }
 607
 608 static void
 609 emit_find_msb_using_lzd(const fs_builder &bld,
 610                         const fs_reg &result,
 611                         const fs_reg &src,
 612                         bool is_signed)
 613 {
 614    fs_inst *inst;
 615    fs_reg temp = src;
 616
 617    if (is_signed) {
 618       /* LZD of an absolute value source almost always does the right
 619        * thing.  There are two problem values:
 620        *
 621        * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
 622        *   0.  However, findMSB(int(0x80000000)) == 30.
 623        *
 624        * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
 625        *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
 626        *
 627        *    For a value of zero or negative one, -1 will be returned.
 628        *
 629        * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
 630        *   findMSB(-(1<<x)) should return x-1.
 631        *
 632        * For all negative number cases, including 0x80000000 and
 633        * 0xffffffff, the correct value is obtained from LZD if instead of
 634        * negating the (already negative) value the logical-not is used.  A
 635        * conditonal logical-not can be achieved in two instructions.
 636        */
 637       temp = bld.vgrf(BRW_REGISTER_TYPE_D);
 638
 639       bld.ASR(temp, src, brw_imm_d(31));
 640       bld.XOR(temp, temp, src);
 641    }
 642
 643    bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
 644            retype(temp, BRW_REGISTER_TYPE_UD));
 645
 646    /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
 647     * from the LSB side. Subtract the result from 31 to convert the MSB
 648     * count into an LSB count.  If no bits are set, LZD will return 32.
 649     * 31-32 = -1, which is exactly what findMSB() is supposed to return.
 650     */
 651    inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
 652    inst->src[0].negate = true;
 653 }
 654
 655 static brw_rnd_mode
 656 brw_rnd_mode_from_nir_op (const nir_op op) {
 657    switch (op) {
 658    case nir_op_f2f16_rtz:
 659       return BRW_RND_MODE_RTZ;
 660    case nir_op_f2f16_rtne:
 661       return BRW_RND_MODE_RTNE;
 662    default:
 663       unreachable("Operation doesn't support rounding mode");
 664    }
 665 }
 666
 667 void
 668 fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
 669 {
 670    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
 671    fs_inst *inst;
 672
 673    fs_reg result = get_nir_dest(instr->dest.dest);
 674    result.type = brw_type_for_nir_type(devinfo,
 675       (nir_alu_type)(nir_op_infos[instr->op].output_type |
 676                      nir_dest_bit_size(instr->dest.dest)));
 677
 678    fs_reg op[4];
 679    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 680       op[i] = get_nir_src(instr->src[i].src);
 681       op[i].type = brw_type_for_nir_type(devinfo,
 682          (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
 683                         nir_src_bit_size(instr->src[i].src)));
 684       op[i].abs = instr->src[i].abs;
 685       op[i].negate = instr->src[i].negate;
 686    }
 687
 688    /* We get a bunch of mov's out of the from_ssa pass and they may still
 689     * be vectorized.  We'll handle them as a special-case.  We'll also
 690     * handle vecN here because it's basically the same thing.
 691     */
 692    switch (instr->op) {
 693    case nir_op_imov:
 694    case nir_op_fmov:
 695    case nir_op_vec2:
 696    case nir_op_vec3:
 697    case nir_op_vec4: {
 698       fs_reg temp = result;
 699       bool need_extra_copy = false;
 700       for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 701          if (!instr->src[i].src.is_ssa &&
 702              instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
 703             need_extra_copy = true;
 704             temp = bld.vgrf(result.type, 4);
 705             break;
 706          }
 707       }
 708
 709       for (unsigned i = 0; i < 4; i++) {
 710          if (!(instr->dest.write_mask & (1 << i)))
 711             continue;
 712
 713          if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
 714             inst = bld.MOV(offset(temp, bld, i),
 715                            offset(op[0], bld, instr->src[0].swizzle[i]));
 716          } else {
 717             inst = bld.MOV(offset(temp, bld, i),
 718                            offset(op[i], bld, instr->src[i].swizzle[0]));
 719          }
 720          inst->saturate = instr->dest.saturate;
 721       }
 722
 723       /* In this case the source and destination registers were the same,
 724        * so we need to insert an extra set of moves in order to deal with
 725        * any swizzling.
 726        */
 727       if (need_extra_copy) {
 728          for (unsigned i = 0; i < 4; i++) {
 729             if (!(instr->dest.write_mask & (1 << i)))
 730                continue;
 731
 732             bld.MOV(offset(result, bld, i), offset(temp, bld, i));
 733          }
 734       }
 735       return;
 736    }
 737    default:
 738       break;
 739    }
 740
 741    /* At this point, we have dealt with any instruction that operates on
 742     * more than a single channel.  Therefore, we can just adjust the source
 743     * and destination registers for that channel and emit the instruction.
 744     */
 745    unsigned channel = 0;
 746    if (nir_op_infos[instr->op].output_size == 0) {
 747       /* Since NIR is doing the scalarizing for us, we should only ever see
 748        * vectorized operations with a single channel.
 749        */
 750       assert(util_bitcount(instr->dest.write_mask) == 1);
 751       channel = ffs(instr->dest.write_mask) - 1;
 752
 753       result = offset(result, bld, channel);
 754    }
 755
 756    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 757       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
 758       op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
 759    }
 760
 761    switch (instr->op) {
 762    case nir_op_i2f32:
 763    case nir_op_u2f32:
 764       if (optimize_extract_to_float(instr, result))
 765          return;
 766       inst = bld.MOV(result, op[0]);
 767       inst->saturate = instr->dest.saturate;
 768       break;
 769
 770    case nir_op_f2f16_rtne:
 771    case nir_op_f2f16_rtz:
 772       bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
 773                brw_imm_d(brw_rnd_mode_from_nir_op(instr->op)));
 774       /* fallthrough */
 775
 776       /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending
 777        * on the HW gen, it is a special hw opcode or just a MOV, and
 778        * brw_F32TO16 (at brw_eu_emit) would do the work to chose.
 779        *
 780        * But if we want to use that opcode, we need to provide support on
 781        * different optimizations and lowerings. As right now HF support is
 782        * only for gen8+, it will be better to use directly the MOV, and use
 783        * BRW_OPCODE_F32TO16 when/if we work for HF support on gen7.
 784        */
 785
 786    case nir_op_f2f16:
 787       inst = bld.MOV(result, op[0]);
 788       inst->saturate = instr->dest.saturate;
 789       break;
 790
 791    case nir_op_b2i:
 792    case nir_op_b2f:
 793       op[0].type = BRW_REGISTER_TYPE_D;
 794       op[0].negate = !op[0].negate;
 795       /* fallthrough */
 796    case nir_op_f2f64:
 797    case nir_op_f2i64:
 798    case nir_op_f2u64:
 799    case nir_op_i2f64:
 800    case nir_op_i2i64:
 801    case nir_op_u2f64:
 802    case nir_op_u2u64:
 803       /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
 804        *
 805        *    "When source or destination is 64b (...), regioning in Align1
 806        *     must follow these rules:
 807        *
 808        *     1. Source and destination horizontal stride must be aligned to
 809        *        the same qword.
 810        *     (...)"
 811        *
 812        * This means that conversions from bit-sizes smaller than 64-bit to
 813        * 64-bit need to have the source data elements aligned to 64-bit.
 814        * This restriction does not apply to BDW and later.
 815        */
 816       if (nir_dest_bit_size(instr->dest.dest) == 64 &&
 817           nir_src_bit_size(instr->src[0].src) < 64 &&
 818           (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
 819          fs_reg tmp = bld.vgrf(result.type, 1);
 820          tmp = subscript(tmp, op[0].type, 0);
 821          inst = bld.MOV(tmp, op[0]);
 822          inst = bld.MOV(result, tmp);
 823          inst->saturate = instr->dest.saturate;
 824          break;
 825       }
 826       /* fallthrough */
 827    case nir_op_f2f32:
 828    case nir_op_f2i32:
 829    case nir_op_f2u32:
 830    case nir_op_f2i16:
 831    case nir_op_f2u16:
 832    case nir_op_i2i32:
 833    case nir_op_u2u32:
 834    case nir_op_i2i16:
 835    case nir_op_u2u16:
 836    case nir_op_i2f16:
 837    case nir_op_u2f16:
 838    case nir_op_i2i8:
 839    case nir_op_u2u8:
 840       inst = bld.MOV(result, op[0]);
 841       inst->saturate = instr->dest.saturate;
 842       break;
 843
 844    case nir_op_fsign: {
 845       assert(!instr->dest.saturate);
 846       if (op[0].abs) {
 847          /* Straightforward since the source can be assumed to be either
 848           * strictly >= 0 or strictly <= 0 depending on the setting of the
 849           * negate flag.
 850           */
 851          set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0]));
 852
 853          inst = (op[0].negate)
 854             ? bld.MOV(result, brw_imm_f(-1.0f))
 855             : bld.MOV(result, brw_imm_f(1.0f));
 856
 857          set_predicate(BRW_PREDICATE_NORMAL, inst);
 858       } else if (type_sz(op[0].type) < 8) {
 859          /* AND(val, 0x80000000) gives the sign bit.
 860           *
 861           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
 862           * zero.
 863           */
 864          bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
 865
 866          fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
 867          op[0].type = BRW_REGISTER_TYPE_UD;
 868          result.type = BRW_REGISTER_TYPE_UD;
 869          bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
 870
 871          inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
 872          inst->predicate = BRW_PREDICATE_NORMAL;
 873       } else {
 874          /* For doubles we do the same but we need to consider:
 875           *
 876           * - 2-src instructions can't operate with 64-bit immediates
 877           * - The sign is encoded in the high 32-bit of each DF
 878           * - We need to produce a DF result.
 879           */
 880
 881          fs_reg zero = vgrf(glsl_type::double_type);
 882          bld.MOV(zero, setup_imm_df(bld, 0.0));
 883          bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
 884
 885          bld.MOV(result, zero);
 886
 887          fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
 888          bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
 889                  brw_imm_ud(0x80000000u));
 890
 891          set_predicate(BRW_PREDICATE_NORMAL,
 892                        bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
 893       }
 894       break;
 895    }
 896
 897    case nir_op_isign: {
 898       /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
 899        *               -> non-negative val generates 0x00000000.
 900        *  Predicated OR sets 1 if val is positive.
 901        */
 902       uint32_t bit_size = nir_dest_bit_size(instr->dest.dest);
 903       assert(bit_size == 32 || bit_size == 16);
 904
 905       fs_reg zero = bit_size == 32 ? brw_imm_d(0) : brw_imm_w(0);
 906       fs_reg one = bit_size == 32 ? brw_imm_d(1) : brw_imm_w(1);
 907       fs_reg shift = bit_size == 32 ? brw_imm_d(31) : brw_imm_w(15);
 908
 909       bld.CMP(bld.null_reg_d(), op[0], zero, BRW_CONDITIONAL_G);
 910       bld.ASR(result, op[0], shift);
 911       inst = bld.OR(result, result, one);
 912       inst->predicate = BRW_PREDICATE_NORMAL;
 913       break;
 914    }
 915
 916    case nir_op_frcp:
 917       inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
 918       inst->saturate = instr->dest.saturate;
 919       break;
 920
 921    case nir_op_fexp2:
 922       inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
 923       inst->saturate = instr->dest.saturate;
 924       break;
 925
 926    case nir_op_flog2:
 927       inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
 928       inst->saturate = instr->dest.saturate;
 929       break;
 930
 931    case nir_op_fsin:
 932       inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
 933       inst->saturate = instr->dest.saturate;
 934       break;
 935
 936    case nir_op_fcos:
 937       inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
 938       inst->saturate = instr->dest.saturate;
 939       break;
 940
 941    case nir_op_fddx:
 942       if (fs_key->high_quality_derivatives) {
 943          inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
 944       } else {
 945          inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
 946       }
 947       inst->saturate = instr->dest.saturate;
 948       break;
 949    case nir_op_fddx_fine:
 950       inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
 951       inst->saturate = instr->dest.saturate;
 952       break;
 953    case nir_op_fddx_coarse:
 954       inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
 955       inst->saturate = instr->dest.saturate;
 956       break;
 957    case nir_op_fddy:
 958       if (fs_key->high_quality_derivatives) {
 959          inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
 960       } else {
 961          inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
 962       }
 963       inst->saturate = instr->dest.saturate;
 964       break;
 965    case nir_op_fddy_fine:
 966       inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
 967       inst->saturate = instr->dest.saturate;
 968       break;
 969    case nir_op_fddy_coarse:
 970       inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
 971       inst->saturate = instr->dest.saturate;
 972       break;
 973
 974    case nir_op_iadd:
 975    case nir_op_fadd:
 976       inst = bld.ADD(result, op[0], op[1]);
 977       inst->saturate = instr->dest.saturate;
 978       break;
 979
 980    case nir_op_fmul:
 981       inst = bld.MUL(result, op[0], op[1]);
 982       inst->saturate = instr->dest.saturate;
 983       break;
 984
 985    case nir_op_imul:
 986       assert(nir_dest_bit_size(instr->dest.dest) < 64);
 987       bld.MUL(result, op[0], op[1]);
 988       break;
 989
 990    case nir_op_imul_high:
 991    case nir_op_umul_high:
 992       assert(nir_dest_bit_size(instr->dest.dest) < 64);
 993       bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
 994       break;
 995
 996    case nir_op_idiv:
 997    case nir_op_udiv:
 998       assert(nir_dest_bit_size(instr->dest.dest) < 64);
 999       bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
1000       break;
1001
1002    case nir_op_uadd_carry:
1003       unreachable("Should have been lowered by carry_to_arith().");
1004
1005    case nir_op_usub_borrow:
1006       unreachable("Should have been lowered by borrow_to_arith().");
1007
1008    case nir_op_umod:
1009    case nir_op_irem:
1010       /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1011        * appears that our hardware just does the right thing for signed
1012        * remainder.
1013        */
1014       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1015       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1016       break;
1017
1018    case nir_op_imod: {
1019       /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
1020       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1021
1022       /* Math instructions don't support conditional mod */
1023       inst = bld.MOV(bld.null_reg_d(), result);
1024       inst->conditional_mod = BRW_CONDITIONAL_NZ;
1025
1026       /* Now, we need to determine if signs of the sources are different.
1027        * When we XOR the sources, the top bit is 0 if they are the same and 1
1028        * if they are different.  We can then use a conditional modifier to
1029        * turn that into a predicate.  This leads us to an XOR.l instruction.
1030        *
1031        * Technically, according to the PRM, you're not allowed to use .l on a
1032        * XOR instruction.  However, emperical experiments and Curro's reading
1033        * of the simulator source both indicate that it's safe.
1034        */
1035       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
1036       inst = bld.XOR(tmp, op[0], op[1]);
1037       inst->predicate = BRW_PREDICATE_NORMAL;
1038       inst->conditional_mod = BRW_CONDITIONAL_L;
1039
1040       /* If the result of the initial remainder operation is non-zero and the
1041        * two sources have different signs, add in a copy of op[1] to get the
1042        * final integer modulus value.
1043        */
1044       inst = bld.ADD(result, result, op[1]);
1045       inst->predicate = BRW_PREDICATE_NORMAL;
1046       break;
1047    }
1048
1049    case nir_op_flt:
1050    case nir_op_fge:
1051    case nir_op_feq:
1052    case nir_op_fne: {
1053       fs_reg dest = result;
1054
1055       const uint32_t bit_size =  nir_src_bit_size(instr->src[0].src);
1056       if (bit_size != 32)
1057          dest = bld.vgrf(op[0].type, 1);
1058
1059       brw_conditional_mod cond;
1060       switch (instr->op) {
1061       case nir_op_flt:
1062          cond = BRW_CONDITIONAL_L;
1063          break;
1064       case nir_op_fge:
1065          cond = BRW_CONDITIONAL_GE;
1066          break;
1067       case nir_op_feq:
1068          cond = BRW_CONDITIONAL_Z;
1069          break;
1070       case nir_op_fne:
1071          cond = BRW_CONDITIONAL_NZ;
1072          break;
1073       default:
1074          unreachable("bad opcode");
1075       }
1076
1077       bld.CMP(dest, op[0], op[1], cond);
1078
1079       if (bit_size > 32) {
1080          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1081       } else if(bit_size < 32) {
1082          /* When we convert the result to 32-bit we need to be careful and do
1083           * it as a signed conversion to get sign extension (for 32-bit true)
1084           */
1085          const brw_reg_type src_type =
1086             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1087
1088          bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1089       }
1090       break;
1091    }
1092
1093    case nir_op_ilt:
1094    case nir_op_ult:
1095    case nir_op_ige:
1096    case nir_op_uge:
1097    case nir_op_ieq:
1098    case nir_op_ine: {
1099       fs_reg dest = result;
1100
1101       const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1102       if (bit_size != 32)
1103          dest = bld.vgrf(op[0].type, 1);
1104
1105       brw_conditional_mod cond;
1106       switch (instr->op) {
1107       case nir_op_ilt:
1108       case nir_op_ult:
1109          cond = BRW_CONDITIONAL_L;
1110          break;
1111       case nir_op_ige:
1112       case nir_op_uge:
1113          cond = BRW_CONDITIONAL_GE;
1114          break;
1115       case nir_op_ieq:
1116          cond = BRW_CONDITIONAL_Z;
1117          break;
1118       case nir_op_ine:
1119          cond = BRW_CONDITIONAL_NZ;
1120          break;
1121       default:
1122          unreachable("bad opcode");
1123       }
1124       bld.CMP(dest, op[0], op[1], cond);
1125
1126       if (bit_size > 32) {
1127          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1128       } else if (bit_size < 32) {
1129          /* When we convert the result to 32-bit we need to be careful and do
1130           * it as a signed conversion to get sign extension (for 32-bit true)
1131           */
1132          const brw_reg_type src_type =
1133             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1134
1135          bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1136       }
1137       break;
1138    }
1139
1140    case nir_op_inot:
1141       if (devinfo->gen >= 8) {
1142          op[0] = resolve_source_modifiers(op[0]);
1143       }
1144       bld.NOT(result, op[0]);
1145       break;
1146    case nir_op_ixor:
1147       if (devinfo->gen >= 8) {
1148          op[0] = resolve_source_modifiers(op[0]);
1149          op[1] = resolve_source_modifiers(op[1]);
1150       }
1151       bld.XOR(result, op[0], op[1]);
1152       break;
1153    case nir_op_ior:
1154       if (devinfo->gen >= 8) {
1155          op[0] = resolve_source_modifiers(op[0]);
1156          op[1] = resolve_source_modifiers(op[1]);
1157       }
1158       bld.OR(result, op[0], op[1]);
1159       break;
1160    case nir_op_iand:
1161       if (devinfo->gen >= 8) {
1162          op[0] = resolve_source_modifiers(op[0]);
1163          op[1] = resolve_source_modifiers(op[1]);
1164       }
1165       bld.AND(result, op[0], op[1]);
1166       break;
1167
1168    case nir_op_fdot2:
1169    case nir_op_fdot3:
1170    case nir_op_fdot4:
1171    case nir_op_ball_fequal2:
1172    case nir_op_ball_iequal2:
1173    case nir_op_ball_fequal3:
1174    case nir_op_ball_iequal3:
1175    case nir_op_ball_fequal4:
1176    case nir_op_ball_iequal4:
1177    case nir_op_bany_fnequal2:
1178    case nir_op_bany_inequal2:
1179    case nir_op_bany_fnequal3:
1180    case nir_op_bany_inequal3:
1181    case nir_op_bany_fnequal4:
1182    case nir_op_bany_inequal4:
1183       unreachable("Lowered by nir_lower_alu_reductions");
1184
1185    case nir_op_fnoise1_1:
1186    case nir_op_fnoise1_2:
1187    case nir_op_fnoise1_3:
1188    case nir_op_fnoise1_4:
1189    case nir_op_fnoise2_1:
1190    case nir_op_fnoise2_2:
1191    case nir_op_fnoise2_3:
1192    case nir_op_fnoise2_4:
1193    case nir_op_fnoise3_1:
1194    case nir_op_fnoise3_2:
1195    case nir_op_fnoise3_3:
1196    case nir_op_fnoise3_4:
1197    case nir_op_fnoise4_1:
1198    case nir_op_fnoise4_2:
1199    case nir_op_fnoise4_3:
1200    case nir_op_fnoise4_4:
1201       unreachable("not reached: should be handled by lower_noise");
1202
1203    case nir_op_ldexp:
1204       unreachable("not reached: should be handled by ldexp_to_arith()");
1205
1206    case nir_op_fsqrt:
1207       inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
1208       inst->saturate = instr->dest.saturate;
1209       break;
1210
1211    case nir_op_frsq:
1212       inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
1213       inst->saturate = instr->dest.saturate;
1214       break;
1215
1216    case nir_op_i2b:
1217    case nir_op_f2b: {
1218       uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1219       if (bit_size == 64) {
1220          /* two-argument instructions can't take 64-bit immediates */
1221          fs_reg zero;
1222          fs_reg tmp;
1223
1224          if (instr->op == nir_op_f2b) {
1225             zero = vgrf(glsl_type::double_type);
1226             tmp = vgrf(glsl_type::double_type);
1227             bld.MOV(zero, setup_imm_df(bld, 0.0));
1228          } else {
1229             zero = vgrf(glsl_type::int64_t_type);
1230             tmp = vgrf(glsl_type::int64_t_type);
1231             bld.MOV(zero, brw_imm_q(0));
1232          }
1233
1234          /* A SIMD16 execution needs to be split in two instructions, so use
1235           * a vgrf instead of the flag register as dst so instruction splitting
1236           * works
1237           */
1238          bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
1239          bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
1240       } else {
1241          fs_reg zero;
1242          if (bit_size == 32) {
1243             zero = instr->op == nir_op_f2b ? brw_imm_f(0.0f) : brw_imm_d(0);
1244          } else {
1245             assert(bit_size == 16);
1246             zero = instr->op == nir_op_f2b ?
1247                retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
1248          }
1249          bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
1250       }
1251       break;
1252    }
1253
1254    case nir_op_ftrunc:
1255       inst = bld.RNDZ(result, op[0]);
1256       inst->saturate = instr->dest.saturate;
1257       break;
1258
1259    case nir_op_fceil: {
1260       op[0].negate = !op[0].negate;
1261       fs_reg temp = vgrf(glsl_type::float_type);
1262       bld.RNDD(temp, op[0]);
1263       temp.negate = true;
1264       inst = bld.MOV(result, temp);
1265       inst->saturate = instr->dest.saturate;
1266       break;
1267    }
1268    case nir_op_ffloor:
1269       inst = bld.RNDD(result, op[0]);
1270       inst->saturate = instr->dest.saturate;
1271       break;
1272    case nir_op_ffract:
1273       inst = bld.FRC(result, op[0]);
1274       inst->saturate = instr->dest.saturate;
1275       break;
1276    case nir_op_fround_even:
1277       inst = bld.RNDE(result, op[0]);
1278       inst->saturate = instr->dest.saturate;
1279       break;
1280
1281    case nir_op_fquantize2f16: {
1282       fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
1283       fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
1284       fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
1285
1286       /* The destination stride must be at least as big as the source stride. */
1287       tmp16.type = BRW_REGISTER_TYPE_W;
1288       tmp16.stride = 2;
1289
1290       /* Check for denormal */
1291       fs_reg abs_src0 = op[0];
1292       abs_src0.abs = true;
1293       bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1294               BRW_CONDITIONAL_L);
1295       /* Get the appropriately signed zero */
1296       bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
1297               retype(op[0], BRW_REGISTER_TYPE_UD),
1298               brw_imm_ud(0x80000000));
1299       /* Do the actual F32 -> F16 -> F32 conversion */
1300       bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
1301       bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
1302       /* Select that or zero based on normal status */
1303       inst = bld.SEL(result, zero, tmp32);
1304       inst->predicate = BRW_PREDICATE_NORMAL;
1305       inst->saturate = instr->dest.saturate;
1306       break;
1307    }
1308
1309    case nir_op_imin:
1310    case nir_op_umin:
1311    case nir_op_fmin:
1312       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1313       inst->saturate = instr->dest.saturate;
1314       break;
1315
1316    case nir_op_imax:
1317    case nir_op_umax:
1318    case nir_op_fmax:
1319       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1320       inst->saturate = instr->dest.saturate;
1321       break;
1322
1323    case nir_op_pack_snorm_2x16:
1324    case nir_op_pack_snorm_4x8:
1325    case nir_op_pack_unorm_2x16:
1326    case nir_op_pack_unorm_4x8:
1327    case nir_op_unpack_snorm_2x16:
1328    case nir_op_unpack_snorm_4x8:
1329    case nir_op_unpack_unorm_2x16:
1330    case nir_op_unpack_unorm_4x8:
1331    case nir_op_unpack_half_2x16:
1332    case nir_op_pack_half_2x16:
1333       unreachable("not reached: should be handled by lower_packing_builtins");
1334
1335    case nir_op_unpack_half_2x16_split_x:
1336       inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
1337       inst->saturate = instr->dest.saturate;
1338       break;
1339    case nir_op_unpack_half_2x16_split_y:
1340       inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
1341       inst->saturate = instr->dest.saturate;
1342       break;
1343
1344    case nir_op_pack_64_2x32_split:
1345    case nir_op_pack_32_2x16_split:
1346       bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1347       break;
1348
1349    case nir_op_unpack_64_2x32_split_x:
1350    case nir_op_unpack_64_2x32_split_y: {
1351       if (instr->op == nir_op_unpack_64_2x32_split_x)
1352          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
1353       else
1354          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
1355       break;
1356    }
1357
1358    case nir_op_unpack_32_2x16_split_x:
1359    case nir_op_unpack_32_2x16_split_y: {
1360       if (instr->op == nir_op_unpack_32_2x16_split_x)
1361          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1362       else
1363          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1364       break;
1365    }
1366
1367    case nir_op_fpow:
1368       inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
1369       inst->saturate = instr->dest.saturate;
1370       break;
1371
1372    case nir_op_bitfield_reverse:
1373       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1374       bld.BFREV(result, op[0]);
1375       break;
1376
1377    case nir_op_bit_count:
1378       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1379       bld.CBIT(result, op[0]);
1380       break;
1381
1382    case nir_op_ufind_msb: {
1383       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1384       emit_find_msb_using_lzd(bld, result, op[0], false);
1385       break;
1386    }
1387
1388    case nir_op_ifind_msb: {
1389       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1390
1391       if (devinfo->gen < 7) {
1392          emit_find_msb_using_lzd(bld, result, op[0], true);
1393       } else {
1394          bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1395
1396          /* FBH counts from the MSB side, while GLSL's findMSB() wants the
1397           * count from the LSB side. If FBH didn't return an error
1398           * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
1399           * count into an LSB count.
1400           */
1401          bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
1402
1403          inst = bld.ADD(result, result, brw_imm_d(31));
1404          inst->predicate = BRW_PREDICATE_NORMAL;
1405          inst->src[0].negate = true;
1406       }
1407       break;
1408    }
1409
1410    case nir_op_find_lsb:
1411       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1412
1413       if (devinfo->gen < 7) {
1414          fs_reg temp = vgrf(glsl_type::int_type);
1415
1416          /* (x & -x) generates a value that consists of only the LSB of x.
1417           * For all powers of 2, findMSB(y) == findLSB(y).
1418           */
1419          fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
1420          fs_reg negated_src = src;
1421
1422          /* One must be negated, and the other must be non-negated.  It
1423           * doesn't matter which is which.
1424           */
1425          negated_src.negate = true;
1426          src.negate = false;
1427
1428          bld.AND(temp, src, negated_src);
1429          emit_find_msb_using_lzd(bld, result, temp, false);
1430       } else {
1431          bld.FBL(result, op[0]);
1432       }
1433       break;
1434
1435    case nir_op_ubitfield_extract:
1436    case nir_op_ibitfield_extract:
1437       unreachable("should have been lowered");
1438    case nir_op_ubfe:
1439    case nir_op_ibfe:
1440       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1441       bld.BFE(result, op[2], op[1], op[0]);
1442       break;
1443    case nir_op_bfm:
1444       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1445       bld.BFI1(result, op[0], op[1]);
1446       break;
1447    case nir_op_bfi:
1448       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1449       bld.BFI2(result, op[0], op[1], op[2]);
1450       break;
1451
1452    case nir_op_bitfield_insert:
1453       unreachable("not reached: should have been lowered");
1454
1455    case nir_op_ishl:
1456    case nir_op_ishr:
1457    case nir_op_ushr: {
1458       fs_reg shift_count = op[1];
1459
1460       if (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) {
1461          if (op[1].file == VGRF &&
1462              (result.type == BRW_REGISTER_TYPE_Q ||
1463               result.type == BRW_REGISTER_TYPE_UQ)) {
1464             shift_count = fs_reg(VGRF, alloc.allocate(dispatch_width / 4),
1465                                  BRW_REGISTER_TYPE_UD);
1466             shift_count.stride = 2;
1467             bld.MOV(shift_count, op[1]);
1468          }
1469       }
1470
1471       switch (instr->op) {
1472       case nir_op_ishl:
1473          bld.SHL(result, op[0], shift_count);
1474          break;
1475       case nir_op_ishr:
1476          bld.ASR(result, op[0], shift_count);
1477          break;
1478       case nir_op_ushr:
1479          bld.SHR(result, op[0], shift_count);
1480          break;
1481       default:
1482          unreachable("not reached");
1483       }
1484       break;
1485    }
1486
1487    case nir_op_pack_half_2x16_split:
1488       bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1489       break;
1490
1491    case nir_op_ffma:
1492       inst = bld.MAD(result, op[2], op[1], op[0]);
1493       inst->saturate = instr->dest.saturate;
1494       break;
1495
1496    case nir_op_flrp:
1497       inst = bld.LRP(result, op[0], op[1], op[2]);
1498       inst->saturate = instr->dest.saturate;
1499       break;
1500
1501    case nir_op_bcsel:
1502       if (optimize_frontfacing_ternary(instr, result))
1503          return;
1504
1505       bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1506       inst = bld.SEL(result, op[1], op[2]);
1507       inst->predicate = BRW_PREDICATE_NORMAL;
1508       break;
1509
1510    case nir_op_extract_u8:
1511    case nir_op_extract_i8: {
1512       unsigned byte = nir_src_as_uint(instr->src[1].src);
1513
1514       /* The PRMs say:
1515        *
1516        *    BDW+
1517        *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1518        *    Use two instructions and a word or DWord intermediate integer type.
1519        */
1520       if (nir_dest_bit_size(instr->dest.dest) == 64) {
1521          const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i8);
1522
1523          if (instr->op == nir_op_extract_i8) {
1524             /* If we need to sign extend, extract to a word first */
1525             fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
1526             bld.MOV(w_temp, subscript(op[0], type, byte));
1527             bld.MOV(result, w_temp);
1528          } else {
1529             /* Otherwise use an AND with 0xff and a word type */
1530             bld.AND(result, subscript(op[0], type, byte / 2), brw_imm_uw(0xff));
1531          }
1532       } else {
1533          const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1534          bld.MOV(result, subscript(op[0], type, byte));
1535       }
1536       break;
1537    }
1538
1539    case nir_op_extract_u16:
1540    case nir_op_extract_i16: {
1541       const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
1542       unsigned word = nir_src_as_uint(instr->src[1].src);
1543       bld.MOV(result, subscript(op[0], type, word));
1544       break;
1545    }
1546
1547    default:
1548       unreachable("unhandled instruction");
1549    }
1550
1551    /* If we need to do a boolean resolve, replace the result with -(x & 1)
1552     * to sign extend the low bit to 0/~0
1553     */
1554    if (devinfo->gen <= 5 &&
1555        (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
1556       fs_reg masked = vgrf(glsl_type::int_type);
1557       bld.AND(masked, result, brw_imm_d(1));
1558       masked.negate = true;
1559       bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
1560    }
1561 }
1562
1563 void
1564 fs_visitor::nir_emit_load_const(const fs_builder &bld,
1565                                 nir_load_const_instr *instr)
1566 {
1567    const brw_reg_type reg_type =
1568       brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
1569    fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1570
1571    switch (instr->def.bit_size) {
1572    case 8:
1573       for (unsigned i = 0; i < instr->def.num_components; i++)
1574          bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value.i8[i]));
1575       break;
1576
1577    case 16:
1578       for (unsigned i = 0; i < instr->def.num_components; i++)
1579          bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value.i16[i]));
1580       break;
1581
1582    case 32:
1583       for (unsigned i = 0; i < instr->def.num_components; i++)
1584          bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i]));
1585       break;
1586
1587    case 64:
1588       assert(devinfo->gen >= 7);
1589       if (devinfo->gen == 7) {
1590          /* We don't get 64-bit integer types until gen8 */
1591          for (unsigned i = 0; i < instr->def.num_components; i++) {
1592             bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
1593                     setup_imm_df(bld, instr->value.f64[i]));
1594          }
1595       } else {
1596          for (unsigned i = 0; i < instr->def.num_components; i++)
1597             bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value.i64[i]));
1598       }
1599       break;
1600
1601    default:
1602       unreachable("Invalid bit size");
1603    }
1604
1605    nir_ssa_values[instr->def.index] = reg;
1606 }
1607
1608 fs_reg
1609 fs_visitor::get_nir_src(const nir_src &src)
1610 {
1611    fs_reg reg;
1612    if (src.is_ssa) {
1613       if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
1614          const brw_reg_type reg_type =
1615             brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D);
1616          reg = bld.vgrf(reg_type, src.ssa->num_components);
1617       } else {
1618          reg = nir_ssa_values[src.ssa->index];
1619       }
1620    } else {
1621       /* We don't handle indirects on locals */
1622       assert(src.reg.indirect == NULL);
1623       reg = offset(nir_locals[src.reg.reg->index], bld,
1624                    src.reg.base_offset * src.reg.reg->num_components);
1625    }
1626
1627    if (nir_src_bit_size(src) == 64 && devinfo->gen == 7) {
1628       /* The only 64-bit type available on gen7 is DF, so use that. */
1629       reg.type = BRW_REGISTER_TYPE_DF;
1630    } else {
1631       /* To avoid floating-point denorm flushing problems, set the type by
1632        * default to an integer type - instructions that need floating point
1633        * semantics will set this to F if they need to
1634        */
1635       reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),
1636                                             BRW_REGISTER_TYPE_D);
1637    }
1638
1639    return reg;
1640 }
1641
1642 /**
1643  * Return an IMM for constants; otherwise call get_nir_src() as normal.
1644  *
1645  * This function should not be called on any value which may be 64 bits.
1646  * We could theoretically support 64-bit on gen8+ but we choose not to
1647  * because it wouldn't work in general (no gen7 support) and there are
1648  * enough restrictions in 64-bit immediates that you can't take the return
1649  * value and treat it the same as the result of get_nir_src().
1650  */
1651 fs_reg
1652 fs_visitor::get_nir_src_imm(const nir_src &src)
1653 {
1654    assert(nir_src_bit_size(src) == 32);
1655    return nir_src_is_const(src) ?
1656           fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(src);
1657 }
1658
1659 fs_reg
1660 fs_visitor::get_nir_dest(const nir_dest &dest)
1661 {
1662    if (dest.is_ssa) {
1663       const brw_reg_type reg_type =
1664          brw_reg_type_from_bit_size(dest.ssa.bit_size,
1665                                     dest.ssa.bit_size == 8 ?
1666                                     BRW_REGISTER_TYPE_D :
1667                                     BRW_REGISTER_TYPE_F);
1668       nir_ssa_values[dest.ssa.index] =
1669          bld.vgrf(reg_type, dest.ssa.num_components);
1670       return nir_ssa_values[dest.ssa.index];
1671    } else {
1672       /* We don't handle indirects on locals */
1673       assert(dest.reg.indirect == NULL);
1674       return offset(nir_locals[dest.reg.reg->index], bld,
1675                     dest.reg.base_offset * dest.reg.reg->num_components);
1676    }
1677 }
1678
1679 void
1680 fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
1681                          unsigned wr_mask)
1682 {
1683    for (unsigned i = 0; i < 4; i++) {
1684       if (!((wr_mask >> i) & 1))
1685          continue;
1686
1687       fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
1688       new_inst->dst = offset(new_inst->dst, bld, i);
1689       for (unsigned j = 0; j < new_inst->sources; j++)
1690          if (new_inst->src[j].file == VGRF)
1691             new_inst->src[j] = offset(new_inst->src[j], bld, i);
1692
1693       bld.emit(new_inst);
1694    }
1695 }
1696
1697 static fs_inst *
1698 emit_pixel_interpolater_send(const fs_builder &bld,
1699                              enum opcode opcode,
1700                              const fs_reg &dst,
1701                              const fs_reg &src,
1702                              const fs_reg &desc,
1703                              glsl_interp_mode interpolation)
1704 {
1705    struct brw_wm_prog_data *wm_prog_data =
1706       brw_wm_prog_data(bld.shader->stage_prog_data);
1707
1708    fs_inst *inst = bld.emit(opcode, dst, src, desc);
1709    /* 2 floats per slot returned */
1710    inst->size_written = 2 * dst.component_size(inst->exec_size);
1711    inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
1712
1713    wm_prog_data->pulls_bary = true;
1714
1715    return inst;
1716 }
1717
1718 /**
1719  * Computes 1 << x, given a D/UD register containing some value x.
1720  */
1721 static fs_reg
1722 intexp2(const fs_builder &bld, const fs_reg &x)
1723 {
1724    assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
1725
1726    fs_reg result = bld.vgrf(x.type, 1);
1727    fs_reg one = bld.vgrf(x.type, 1);
1728
1729    bld.MOV(one, retype(brw_imm_d(1), one.type));
1730    bld.SHL(result, one, x);
1731    return result;
1732 }
1733
1734 void
1735 fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
1736 {
1737    assert(stage == MESA_SHADER_GEOMETRY);
1738
1739    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1740
1741    if (gs_compile->control_data_header_size_bits == 0)
1742       return;
1743
1744    /* We can only do EndPrimitive() functionality when the control data
1745     * consists of cut bits.  Fortunately, the only time it isn't is when the
1746     * output type is points, in which case EndPrimitive() is a no-op.
1747     */
1748    if (gs_prog_data->control_data_format !=
1749        GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
1750       return;
1751    }
1752
1753    /* Cut bits use one bit per vertex. */
1754    assert(gs_compile->control_data_bits_per_vertex == 1);
1755
1756    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1757    vertex_count.type = BRW_REGISTER_TYPE_UD;
1758
1759    /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
1760     * vertex n, 0 otherwise.  So all we need to do here is mark bit
1761     * (vertex_count - 1) % 32 in the cut_bits register to indicate that
1762     * EndPrimitive() was called after emitting vertex (vertex_count - 1);
1763     * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
1764     *
1765     * Note that if EndPrimitive() is called before emitting any vertices, this
1766     * will cause us to set bit 31 of the control_data_bits register to 1.
1767     * That's fine because:
1768     *
1769     * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
1770     *   output, so the hardware will ignore cut bit 31.
1771     *
1772     * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
1773     *   last vertex, so setting cut bit 31 has no effect (since the primitive
1774     *   is automatically ended when the GS terminates).
1775     *
1776     * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
1777     *   control_data_bits register to 0 when the first vertex is emitted.
1778     */
1779
1780    const fs_builder abld = bld.annotate("end primitive");
1781
1782    /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
1783    fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1784    abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1785    fs_reg mask = intexp2(abld, prev_count);
1786    /* Note: we're relying on the fact that the GEN SHL instruction only pays
1787     * attention to the lower 5 bits of its second source argument, so on this
1788     * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
1789     * ((vertex_count - 1) % 32).
1790     */
1791    abld.OR(this->control_data_bits, this->control_data_bits, mask);
1792 }
1793
1794 void
1795 fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
1796 {
1797    assert(stage == MESA_SHADER_GEOMETRY);
1798    assert(gs_compile->control_data_bits_per_vertex != 0);
1799
1800    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1801
1802    const fs_builder abld = bld.annotate("emit control data bits");
1803    const fs_builder fwa_bld = bld.exec_all();
1804
1805    /* We use a single UD register to accumulate control data bits (32 bits
1806     * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
1807     * at a time.
1808     *
1809     * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
1810     * We have select a 128-bit group via the Global and Per-Slot Offsets, then
1811     * use the Channel Mask phase to enable/disable which DWord within that
1812     * group to write.  (Remember, different SIMD8 channels may have emitted
1813     * different numbers of vertices, so we may need per-slot offsets.)
1814     *
1815     * Channel masking presents an annoying problem: we may have to replicate
1816     * the data up to 4 times:
1817     *
1818     * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
1819     *
1820     * To avoid penalizing shaders that emit a small number of vertices, we
1821     * can avoid these sometimes: if the size of the control data header is
1822     * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
1823     * land in the same 128-bit group, so we can skip per-slot offsets.
1824     *
1825     * Similarly, if the control data header is <= 32 bits, there is only one
1826     * DWord, so we can skip channel masks.
1827     */
1828    enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
1829
1830    fs_reg channel_mask, per_slot_offset;
1831
1832    if (gs_compile->control_data_header_size_bits > 32) {
1833       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
1834       channel_mask = vgrf(glsl_type::uint_type);
1835    }
1836
1837    if (gs_compile->control_data_header_size_bits > 128) {
1838       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
1839       per_slot_offset = vgrf(glsl_type::uint_type);
1840    }
1841
1842    /* Figure out which DWord we're trying to write to using the formula:
1843     *
1844     *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
1845     *
1846     * Since bits_per_vertex is a power of two, and is known at compile
1847     * time, this can be optimized to:
1848     *
1849     *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
1850     */
1851    if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
1852       fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1853       fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1854       abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1855       unsigned log2_bits_per_vertex =
1856          util_last_bit(gs_compile->control_data_bits_per_vertex);
1857       abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
1858
1859       if (per_slot_offset.file != BAD_FILE) {
1860          /* Set the per-slot offset to dword_index / 4, so that we'll write to
1861           * the appropriate OWord within the control data header.
1862           */
1863          abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
1864       }
1865
1866       /* Set the channel masks to 1 << (dword_index % 4), so that we'll
1867        * write to the appropriate DWORD within the OWORD.
1868        */
1869       fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1870       fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
1871       channel_mask = intexp2(fwa_bld, channel);
1872       /* Then the channel masks need to be in bits 23:16. */
1873       fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
1874    }
1875
1876    /* Store the control data bits in the message payload and send it. */
1877    int mlen = 2;
1878    if (channel_mask.file != BAD_FILE)
1879       mlen += 4; /* channel masks, plus 3 extra copies of the data */
1880    if (per_slot_offset.file != BAD_FILE)
1881       mlen++;
1882
1883    fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
1884    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
1885    int i = 0;
1886    sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1887    if (per_slot_offset.file != BAD_FILE)
1888       sources[i++] = per_slot_offset;
1889    if (channel_mask.file != BAD_FILE)
1890       sources[i++] = channel_mask;
1891    while (i < mlen) {
1892       sources[i++] = this->control_data_bits;
1893    }
1894
1895    abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
1896    fs_inst *inst = abld.emit(opcode, reg_undef, payload);
1897    inst->mlen = mlen;
1898    /* We need to increment Global Offset by 256-bits to make room for
1899     * Broadwell's extra "Vertex Count" payload at the beginning of the
1900     * URB entry.  Since this is an OWord message, Global Offset is counted
1901     * in 128-bit units, so we must set it to 2.
1902     */
1903    if (gs_prog_data->static_vertex_count == -1)
1904       inst->offset = 2;
1905 }
1906
1907 void
1908 fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
1909                                             unsigned stream_id)
1910 {
1911    /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
1912
1913    /* Note: we are calling this *before* increasing vertex_count, so
1914     * this->vertex_count == vertex_count - 1 in the formula above.
1915     */
1916
1917    /* Stream mode uses 2 bits per vertex */
1918    assert(gs_compile->control_data_bits_per_vertex == 2);
1919
1920    /* Must be a valid stream */
1921    assert(stream_id < MAX_VERTEX_STREAMS);
1922
1923    /* Control data bits are initialized to 0 so we don't have to set any
1924     * bits when sending vertices to stream 0.
1925     */
1926    if (stream_id == 0)
1927       return;
1928
1929    const fs_builder abld = bld.annotate("set stream control data bits", NULL);
1930
1931    /* reg::sid = stream_id */
1932    fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1933    abld.MOV(sid, brw_imm_ud(stream_id));
1934
1935    /* reg:shift_count = 2 * (vertex_count - 1) */
1936    fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1937    abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
1938
1939    /* Note: we're relying on the fact that the GEN SHL instruction only pays
1940     * attention to the lower 5 bits of its second source argument, so on this
1941     * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
1942     * stream_id << ((2 * (vertex_count - 1)) % 32).
1943     */
1944    fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1945    abld.SHL(mask, sid, shift_count);
1946    abld.OR(this->control_data_bits, this->control_data_bits, mask);
1947 }
1948
1949 void
1950 fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
1951                            unsigned stream_id)
1952 {
1953    assert(stage == MESA_SHADER_GEOMETRY);
1954
1955    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1956
1957    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1958    vertex_count.type = BRW_REGISTER_TYPE_UD;
1959
1960    /* Haswell and later hardware ignores the "Render Stream Select" bits
1961     * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
1962     * and instead sends all primitives down the pipeline for rasterization.
1963     * If the SOL stage is enabled, "Render Stream Select" is honored and
1964     * primitives bound to non-zero streams are discarded after stream output.
1965     *
1966     * Since the only purpose of primives sent to non-zero streams is to
1967     * be recorded by transform feedback, we can simply discard all geometry
1968     * bound to these streams when transform feedback is disabled.
1969     */
1970    if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
1971       return;
1972
1973    /* If we're outputting 32 control data bits or less, then we can wait
1974     * until the shader is over to output them all.  Otherwise we need to
1975     * output them as we go.  Now is the time to do it, since we're about to
1976     * output the vertex_count'th vertex, so it's guaranteed that the
1977     * control data bits associated with the (vertex_count - 1)th vertex are
1978     * correct.
1979     */
1980    if (gs_compile->control_data_header_size_bits > 32) {
1981       const fs_builder abld =
1982          bld.annotate("emit vertex: emit control data bits");
1983
1984       /* Only emit control data bits if we've finished accumulating a batch
1985        * of 32 bits.  This is the case when:
1986        *
1987        *     (vertex_count * bits_per_vertex) % 32 == 0
1988        *
1989        * (in other words, when the last 5 bits of vertex_count *
1990        * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
1991        * integer n (which is always the case, since bits_per_vertex is
1992        * always 1 or 2), this is equivalent to requiring that the last 5-n
1993        * bits of vertex_count are 0:
1994        *
1995        *     vertex_count & (2^(5-n) - 1) == 0
1996        *
1997        * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
1998        * equivalent to:
1999        *
2000        *     vertex_count & (32 / bits_per_vertex - 1) == 0
2001        *
2002        * TODO: If vertex_count is an immediate, we could do some of this math
2003        *       at compile time...
2004        */
2005       fs_inst *inst =
2006          abld.AND(bld.null_reg_d(), vertex_count,
2007                   brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
2008       inst->conditional_mod = BRW_CONDITIONAL_Z;
2009
2010       abld.IF(BRW_PREDICATE_NORMAL);
2011       /* If vertex_count is 0, then no control data bits have been
2012        * accumulated yet, so we can skip emitting them.
2013        */
2014       abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
2015                BRW_CONDITIONAL_NEQ);
2016       abld.IF(BRW_PREDICATE_NORMAL);
2017       emit_gs_control_data_bits(vertex_count);
2018       abld.emit(BRW_OPCODE_ENDIF);
2019
2020       /* Reset control_data_bits to 0 so we can start accumulating a new
2021        * batch.
2022        *
2023        * Note: in the case where vertex_count == 0, this neutralizes the
2024        * effect of any call to EndPrimitive() that the shader may have
2025        * made before outputting its first vertex.
2026        */
2027       inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
2028       inst->force_writemask_all = true;
2029       abld.emit(BRW_OPCODE_ENDIF);
2030    }
2031
2032    emit_urb_writes(vertex_count);
2033
2034    /* In stream mode we have to set control data bits for all vertices
2035     * unless we have disabled control data bits completely (which we do
2036     * do for GL_POINTS outputs that don't use streams).
2037     */
2038    if (gs_compile->control_data_header_size_bits > 0 &&
2039        gs_prog_data->control_data_format ==
2040           GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2041       set_gs_stream_control_data_bits(vertex_count, stream_id);
2042    }
2043 }
2044
2045 void
2046 fs_visitor::emit_gs_input_load(const fs_reg &dst,
2047                                const nir_src &vertex_src,
2048                                unsigned base_offset,
2049                                const nir_src &offset_src,
2050                                unsigned num_components,
2051                                unsigned first_component)
2052 {
2053    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2054    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2055
2056    /* TODO: figure out push input layout for invocations == 1 */
2057    /* TODO: make this work with 64-bit inputs */
2058    if (gs_prog_data->invocations == 1 &&
2059        type_sz(dst.type) <= 4 &&
2060        nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
2061        4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
2062       int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
2063                        nir_src_as_uint(vertex_src) * push_reg_count;
2064       for (unsigned i = 0; i < num_components; i++) {
2065          bld.MOV(offset(dst, bld, i),
2066                  fs_reg(ATTR, imm_offset + i + first_component, dst.type));
2067       }
2068       return;
2069    }
2070
2071    /* Resort to the pull model.  Ensure the VUE handles are provided. */
2072    assert(gs_prog_data->base.include_vue_handles);
2073
2074    unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
2075    fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2076
2077    if (gs_prog_data->invocations == 1) {
2078       if (nir_src_is_const(vertex_src)) {
2079          /* The vertex index is constant; just select the proper URB handle. */
2080          icp_handle =
2081             retype(brw_vec8_grf(first_icp_handle + nir_src_as_uint(vertex_src), 0),
2082                    BRW_REGISTER_TYPE_UD);
2083       } else {
2084          /* The vertex index is non-constant.  We need to use indirect
2085           * addressing to fetch the proper URB handle.
2086           *
2087           * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2088           * indicating that channel <n> should read the handle from
2089           * DWord <n>.  We convert that to bytes by multiplying by 4.
2090           *
2091           * Next, we convert the vertex index to bytes by multiplying
2092           * by 32 (shifting by 5), and add the two together.  This is
2093           * the final indirect byte offset.
2094           */
2095          fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
2096          fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2097          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2098          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2099
2100          /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
2101          bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
2102          /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2103          bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2104          /* Convert vertex_index to bytes (multiply by 32) */
2105          bld.SHL(vertex_offset_bytes,
2106                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2107                  brw_imm_ud(5u));
2108          bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2109
2110          /* Use first_icp_handle as the base offset.  There is one register
2111           * of URB handles per vertex, so inform the register allocator that
2112           * we might read up to nir->info.gs.vertices_in registers.
2113           */
2114          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2115                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2116                   fs_reg(icp_offset_bytes),
2117                   brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));
2118       }
2119    } else {
2120       assert(gs_prog_data->invocations > 1);
2121
2122       if (nir_src_is_const(vertex_src)) {
2123          unsigned vertex = nir_src_as_uint(vertex_src);
2124          assert(devinfo->gen >= 9 || vertex <= 5);
2125          bld.MOV(icp_handle,
2126                  retype(brw_vec1_grf(first_icp_handle + vertex / 8, vertex % 8),
2127                         BRW_REGISTER_TYPE_UD));
2128       } else {
2129          /* The vertex index is non-constant.  We need to use indirect
2130           * addressing to fetch the proper URB handle.
2131           *
2132           */
2133          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2134
2135          /* Convert vertex_index to bytes (multiply by 4) */
2136          bld.SHL(icp_offset_bytes,
2137                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2138                  brw_imm_ud(2u));
2139
2140          /* Use first_icp_handle as the base offset.  There is one DWord
2141           * of URB handles per vertex, so inform the register allocator that
2142           * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2143           */
2144          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2145                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2146                   fs_reg(icp_offset_bytes),
2147                   brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) *
2148                              REG_SIZE));
2149       }
2150    }
2151
2152    fs_inst *inst;
2153
2154    fs_reg tmp_dst = dst;
2155    fs_reg indirect_offset = get_nir_src(offset_src);
2156    unsigned num_iterations = 1;
2157    unsigned orig_num_components = num_components;
2158
2159    if (type_sz(dst.type) == 8) {
2160       if (num_components > 2) {
2161          num_iterations = 2;
2162          num_components = 2;
2163       }
2164       fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2165       tmp_dst = tmp;
2166       first_component = first_component / 2;
2167    }
2168
2169    for (unsigned iter = 0; iter < num_iterations; iter++) {
2170       if (nir_src_is_const(offset_src)) {
2171          /* Constant indexing - use global offset. */
2172          if (first_component != 0) {
2173             unsigned read_components = num_components + first_component;
2174             fs_reg tmp = bld.vgrf(dst.type, read_components);
2175             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2176             inst->size_written = read_components *
2177                                  tmp.component_size(inst->exec_size);
2178             for (unsigned i = 0; i < num_components; i++) {
2179                bld.MOV(offset(tmp_dst, bld, i),
2180                        offset(tmp, bld, i + first_component));
2181             }
2182          } else {
2183             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst,
2184                             icp_handle);
2185             inst->size_written = num_components *
2186                                  tmp_dst.component_size(inst->exec_size);
2187          }
2188          inst->offset = base_offset + nir_src_as_uint(offset_src);
2189          inst->mlen = 1;
2190       } else {
2191          /* Indirect indexing - use per-slot offsets as well. */
2192          const fs_reg srcs[] = { icp_handle, indirect_offset };
2193          unsigned read_components = num_components + first_component;
2194          fs_reg tmp = bld.vgrf(dst.type, read_components);
2195          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2196          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2197          if (first_component != 0) {
2198             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2199                             payload);
2200             inst->size_written = read_components *
2201                                  tmp.component_size(inst->exec_size);
2202             for (unsigned i = 0; i < num_components; i++) {
2203                bld.MOV(offset(tmp_dst, bld, i),
2204                        offset(tmp, bld, i + first_component));
2205             }
2206          } else {
2207             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst,
2208                          payload);
2209             inst->size_written = num_components *
2210                                  tmp_dst.component_size(inst->exec_size);
2211          }
2212          inst->offset = base_offset;
2213          inst->mlen = 2;
2214       }
2215
2216       if (type_sz(dst.type) == 8) {
2217          shuffle_from_32bit_read(bld,
2218                                  offset(dst, bld, iter * 2),
2219                                  retype(tmp_dst, BRW_REGISTER_TYPE_D),
2220                                  0,
2221                                  num_components);
2222       }
2223
2224       if (num_iterations > 1) {
2225          num_components = orig_num_components - 2;
2226          if(nir_src_is_const(offset_src)) {
2227             base_offset++;
2228          } else {
2229             fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2230             bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u));
2231             indirect_offset = new_indirect;
2232          }
2233       }
2234    }
2235 }
2236
2237 fs_reg
2238 fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
2239 {
2240    nir_src *offset_src = nir_get_io_offset_src(instr);
2241
2242    if (nir_src_is_const(*offset_src)) {
2243       /* The only constant offset we should find is 0.  brw_nir.c's
2244        * add_const_offset_to_base() will fold other constant offsets
2245        * into instr->const_index[0].
2246        */
2247       assert(nir_src_as_uint(*offset_src) == 0);
2248       return fs_reg();
2249    }
2250
2251    return get_nir_src(*offset_src);
2252 }
2253
2254 void
2255 fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
2256                                   nir_intrinsic_instr *instr)
2257 {
2258    assert(stage == MESA_SHADER_VERTEX);
2259
2260    fs_reg dest;
2261    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2262       dest = get_nir_dest(instr->dest);
2263
2264    switch (instr->intrinsic) {
2265    case nir_intrinsic_load_vertex_id:
2266    case nir_intrinsic_load_base_vertex:
2267       unreachable("should be lowered by nir_lower_system_values()");
2268
2269    case nir_intrinsic_load_input: {
2270       fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type);
2271       unsigned first_component = nir_intrinsic_component(instr);
2272       unsigned num_components = instr->num_components;
2273
2274       src = offset(src, bld, nir_src_as_uint(instr->src[0]));
2275
2276       if (type_sz(dest.type) == 8)
2277          first_component /= 2;
2278
2279       /* For 16-bit support maybe a temporary will be needed to copy from
2280        * the ATTR file.
2281        */
2282       shuffle_from_32bit_read(bld, dest, retype(src, BRW_REGISTER_TYPE_D),
2283                               first_component, num_components);
2284       break;
2285    }
2286
2287    case nir_intrinsic_load_vertex_id_zero_base:
2288    case nir_intrinsic_load_instance_id:
2289    case nir_intrinsic_load_base_instance:
2290    case nir_intrinsic_load_draw_id:
2291    case nir_intrinsic_load_first_vertex:
2292    case nir_intrinsic_load_is_indexed_draw:
2293       unreachable("lowered by brw_nir_lower_vs_inputs");
2294
2295    default:
2296       nir_emit_intrinsic(bld, instr);
2297       break;
2298    }
2299 }
2300
2301 void
2302 fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
2303                                    nir_intrinsic_instr *instr)
2304 {
2305    assert(stage == MESA_SHADER_TESS_CTRL);
2306    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2307    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2308
2309    fs_reg dst;
2310    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2311       dst = get_nir_dest(instr->dest);
2312
2313    switch (instr->intrinsic) {
2314    case nir_intrinsic_load_primitive_id:
2315       bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
2316       break;
2317    case nir_intrinsic_load_invocation_id:
2318       bld.MOV(retype(dst, invocation_id.type), invocation_id);
2319       break;
2320    case nir_intrinsic_load_patch_vertices_in:
2321       bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
2322               brw_imm_d(tcs_key->input_vertices));
2323       break;
2324
2325    case nir_intrinsic_barrier: {
2326       if (tcs_prog_data->instances == 1)
2327          break;
2328
2329       fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2330       fs_reg m0_2 = component(m0, 2);
2331
2332       const fs_builder chanbld = bld.exec_all().group(1, 0);
2333
2334       /* Zero the message header */
2335       bld.exec_all().MOV(m0, brw_imm_ud(0u));
2336
2337       /* Copy "Barrier ID" from r0.2, bits 16:13 */
2338       chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2339                   brw_imm_ud(INTEL_MASK(16, 13)));
2340
2341       /* Shift it up to bits 27:24. */
2342       chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
2343
2344       /* Set the Barrier Count and the enable bit */
2345       chanbld.OR(m0_2, m0_2,
2346                  brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
2347
2348       bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2349       break;
2350    }
2351
2352    case nir_intrinsic_load_input:
2353       unreachable("nir_lower_io should never give us these.");
2354       break;
2355
2356    case nir_intrinsic_load_per_vertex_input: {
2357       fs_reg indirect_offset = get_indirect_offset(instr);
2358       unsigned imm_offset = instr->const_index[0];
2359
2360       const nir_src &vertex_src = instr->src[0];
2361
2362       fs_inst *inst;
2363
2364       fs_reg icp_handle;
2365
2366       if (nir_src_is_const(vertex_src)) {
2367          /* Emit a MOV to resolve <0,1,0> regioning. */
2368          icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2369          unsigned vertex = nir_src_as_uint(vertex_src);
2370          bld.MOV(icp_handle,
2371                  retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7),
2372                         BRW_REGISTER_TYPE_UD));
2373       } else if (tcs_prog_data->instances == 1 &&
2374                  vertex_src.is_ssa &&
2375                  vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
2376                  nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) {
2377          /* For the common case of only 1 instance, an array index of
2378           * gl_InvocationID means reading g1.  Skip all the indirect work.
2379           */
2380          icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
2381       } else {
2382          /* The vertex index is non-constant.  We need to use indirect
2383           * addressing to fetch the proper URB handle.
2384           */
2385          icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2386
2387          /* Each ICP handle is a single DWord (4 bytes) */
2388          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2389          bld.SHL(vertex_offset_bytes,
2390                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2391                  brw_imm_ud(2u));
2392
2393          /* Start at g1.  We might read up to 4 registers. */
2394          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2395                   retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
2396                   brw_imm_ud(4 * REG_SIZE));
2397       }
2398
2399       /* We can only read two double components with each URB read, so
2400        * we send two read messages in that case, each one loading up to
2401        * two double components.
2402        */
2403       unsigned num_iterations = 1;
2404       unsigned num_components = instr->num_components;
2405       unsigned first_component = nir_intrinsic_component(instr);
2406       fs_reg orig_dst = dst;
2407       if (type_sz(dst.type) == 8) {
2408          first_component = first_component / 2;
2409          if (instr->num_components > 2) {
2410             num_iterations = 2;
2411             num_components = 2;
2412          }
2413
2414          fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2415          dst = tmp;
2416       }
2417
2418       for (unsigned iter = 0; iter < num_iterations; iter++) {
2419          if (indirect_offset.file == BAD_FILE) {
2420             /* Constant indexing - use global offset. */
2421             if (first_component != 0) {
2422                unsigned read_components = num_components + first_component;
2423                fs_reg tmp = bld.vgrf(dst.type, read_components);
2424                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2425                for (unsigned i = 0; i < num_components; i++) {
2426                   bld.MOV(offset(dst, bld, i),
2427                           offset(tmp, bld, i + first_component));
2428                }
2429             } else {
2430                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
2431             }
2432             inst->offset = imm_offset;
2433             inst->mlen = 1;
2434          } else {
2435             /* Indirect indexing - use per-slot offsets as well. */
2436             const fs_reg srcs[] = { icp_handle, indirect_offset };
2437             fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2438             bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2439             if (first_component != 0) {
2440                unsigned read_components = num_components + first_component;
2441                fs_reg tmp = bld.vgrf(dst.type, read_components);
2442                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2443                                payload);
2444                for (unsigned i = 0; i < num_components; i++) {
2445                   bld.MOV(offset(dst, bld, i),
2446                           offset(tmp, bld, i + first_component));
2447                }
2448             } else {
2449                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2450                                payload);
2451             }
2452             inst->offset = imm_offset;
2453             inst->mlen = 2;
2454          }
2455          inst->size_written = (num_components + first_component) *
2456                               inst->dst.component_size(inst->exec_size);
2457
2458          /* If we are reading 64-bit data using 32-bit read messages we need
2459           * build proper 64-bit data elements by shuffling the low and high
2460           * 32-bit components around like we do for other things like UBOs
2461           * or SSBOs.
2462           */
2463          if (type_sz(dst.type) == 8) {
2464             shuffle_from_32bit_read(bld,
2465                                     offset(orig_dst, bld, iter * 2),
2466                                     retype(dst, BRW_REGISTER_TYPE_D),
2467                                     0, num_components);
2468          }
2469
2470          /* Copy the temporary to the destination to deal with writemasking.
2471           *
2472           * Also attempt to deal with gl_PointSize being in the .w component.
2473           */
2474          if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2475             assert(type_sz(dst.type) < 8);
2476             inst->dst = bld.vgrf(dst.type, 4);
2477             inst->size_written = 4 * REG_SIZE;
2478             bld.MOV(dst, offset(inst->dst, bld, 3));
2479          }
2480
2481          /* If we are loading double data and we need a second read message
2482           * adjust the write offset
2483           */
2484          if (num_iterations > 1) {
2485             num_components = instr->num_components - 2;
2486             imm_offset++;
2487          }
2488       }
2489       break;
2490    }
2491
2492    case nir_intrinsic_load_output:
2493    case nir_intrinsic_load_per_vertex_output: {
2494       fs_reg indirect_offset = get_indirect_offset(instr);
2495       unsigned imm_offset = instr->const_index[0];
2496       unsigned first_component = nir_intrinsic_component(instr);
2497
2498       fs_inst *inst;
2499       if (indirect_offset.file == BAD_FILE) {
2500          /* Replicate the patch handle to all enabled channels */
2501          fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2502          bld.MOV(patch_handle,
2503                  retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
2504
2505          {
2506             if (first_component != 0) {
2507                unsigned read_components =
2508                   instr->num_components + first_component;
2509                fs_reg tmp = bld.vgrf(dst.type, read_components);
2510                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2511                                patch_handle);
2512                inst->size_written = read_components * REG_SIZE;
2513                for (unsigned i = 0; i < instr->num_components; i++) {
2514                   bld.MOV(offset(dst, bld, i),
2515                           offset(tmp, bld, i + first_component));
2516                }
2517             } else {
2518                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
2519                                patch_handle);
2520                inst->size_written = instr->num_components * REG_SIZE;
2521             }
2522             inst->offset = imm_offset;
2523             inst->mlen = 1;
2524          }
2525       } else {
2526          /* Indirect indexing - use per-slot offsets as well. */
2527          const fs_reg srcs[] = {
2528             retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2529             indirect_offset
2530          };
2531          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2532          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2533          if (first_component != 0) {
2534             unsigned read_components =
2535                instr->num_components + first_component;
2536             fs_reg tmp = bld.vgrf(dst.type, read_components);
2537             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2538                             payload);
2539             inst->size_written = read_components * REG_SIZE;
2540             for (unsigned i = 0; i < instr->num_components; i++) {
2541                bld.MOV(offset(dst, bld, i),
2542                        offset(tmp, bld, i + first_component));
2543             }
2544          } else {
2545             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2546                             payload);
2547             inst->size_written = instr->num_components * REG_SIZE;
2548          }
2549          inst->offset = imm_offset;
2550          inst->mlen = 2;
2551       }
2552       break;
2553    }
2554
2555    case nir_intrinsic_store_output:
2556    case nir_intrinsic_store_per_vertex_output: {
2557       fs_reg value = get_nir_src(instr->src[0]);
2558       bool is_64bit = (instr->src[0].is_ssa ?
2559          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
2560       fs_reg indirect_offset = get_indirect_offset(instr);
2561       unsigned imm_offset = instr->const_index[0];
2562       unsigned mask = instr->const_index[1];
2563       unsigned header_regs = 0;
2564       fs_reg srcs[7];
2565       srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
2566
2567       if (indirect_offset.file != BAD_FILE) {
2568          srcs[header_regs++] = indirect_offset;
2569       }
2570
2571       if (mask == 0)
2572          break;
2573
2574       unsigned num_components = util_last_bit(mask);
2575       enum opcode opcode;
2576
2577       /* We can only pack two 64-bit components in a single message, so send
2578        * 2 messages if we have more components
2579        */
2580       unsigned num_iterations = 1;
2581       unsigned iter_components = num_components;
2582       unsigned first_component = nir_intrinsic_component(instr);
2583       if (is_64bit) {
2584          first_component = first_component / 2;
2585          if (instr->num_components > 2) {
2586             num_iterations = 2;
2587             iter_components = 2;
2588          }
2589       }
2590
2591       mask = mask << first_component;
2592
2593       for (unsigned iter = 0; iter < num_iterations; iter++) {
2594          if (!is_64bit && mask != WRITEMASK_XYZW) {
2595             srcs[header_regs++] = brw_imm_ud(mask << 16);
2596             opcode = indirect_offset.file != BAD_FILE ?
2597                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2598                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2599          } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) {
2600             /* Expand the 64-bit mask to 32-bit channels. We only handle
2601              * two channels in each iteration, so we only care about X/Y.
2602              */
2603             unsigned mask32 = 0;
2604             if (mask & WRITEMASK_X)
2605                mask32 |= WRITEMASK_XY;
2606             if (mask & WRITEMASK_Y)
2607                mask32 |= WRITEMASK_ZW;
2608
2609             /* If the mask does not include any of the channels X or Y there
2610              * is nothing to do in this iteration. Move on to the next couple
2611              * of 64-bit channels.
2612              */
2613             if (!mask32) {
2614                mask >>= 2;
2615                imm_offset++;
2616                continue;
2617             }
2618
2619             srcs[header_regs++] = brw_imm_ud(mask32 << 16);
2620             opcode = indirect_offset.file != BAD_FILE ?
2621                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2622                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2623          } else {
2624             opcode = indirect_offset.file != BAD_FILE ?
2625                SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
2626                SHADER_OPCODE_URB_WRITE_SIMD8;
2627          }
2628
2629          for (unsigned i = 0; i < iter_components; i++) {
2630             if (!(mask & (1 << (i + first_component))))
2631                continue;
2632
2633             if (!is_64bit) {
2634                srcs[header_regs + i + first_component] = offset(value, bld, i);
2635             } else {
2636                /* We need to shuffle the 64-bit data to match the layout
2637                 * expected by our 32-bit URB write messages. We use a temporary
2638                 * for that.
2639                 */
2640                unsigned channel = iter * 2 + i;
2641                fs_reg dest = shuffle_for_32bit_write(bld, value, channel, 1);
2642
2643                srcs[header_regs + (i + first_component) * 2] = dest;
2644                srcs[header_regs + (i + first_component) * 2 + 1] =
2645                   offset(dest, bld, 1);
2646             }
2647          }
2648
2649          unsigned mlen =
2650             header_regs + (is_64bit ? 2 * iter_components : iter_components) +
2651             (is_64bit ? 2 * first_component : first_component);
2652          fs_reg payload =
2653             bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
2654          bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
2655
2656          fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
2657          inst->offset = imm_offset;
2658          inst->mlen = mlen;
2659
2660          /* If this is a 64-bit attribute, select the next two 64-bit channels
2661           * to be handled in the next iteration.
2662           */
2663          if (is_64bit) {
2664             mask >>= 2;
2665             imm_offset++;
2666          }
2667       }
2668       break;
2669    }
2670
2671    default:
2672       nir_emit_intrinsic(bld, instr);
2673       break;
2674    }
2675 }
2676
2677 void
2678 fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
2679                                    nir_intrinsic_instr *instr)
2680 {
2681    assert(stage == MESA_SHADER_TESS_EVAL);
2682    struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
2683
2684    fs_reg dest;
2685    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2686       dest = get_nir_dest(instr->dest);
2687
2688    switch (instr->intrinsic) {
2689    case nir_intrinsic_load_primitive_id:
2690       bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
2691       break;
2692    case nir_intrinsic_load_tess_coord:
2693       /* gl_TessCoord is part of the payload in g1-3 */
2694       for (unsigned i = 0; i < 3; i++) {
2695          bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
2696       }
2697       break;
2698
2699    case nir_intrinsic_load_input:
2700    case nir_intrinsic_load_per_vertex_input: {
2701       fs_reg indirect_offset = get_indirect_offset(instr);
2702       unsigned imm_offset = instr->const_index[0];
2703       unsigned first_component = nir_intrinsic_component(instr);
2704
2705       if (type_sz(dest.type) == 8) {
2706          first_component = first_component / 2;
2707       }
2708
2709       fs_inst *inst;
2710       if (indirect_offset.file == BAD_FILE) {
2711          /* Arbitrarily only push up to 32 vec4 slots worth of data,
2712           * which is 16 registers (since each holds 2 vec4 slots).
2713           */
2714          unsigned slot_count = 1;
2715          if (type_sz(dest.type) == 8 && instr->num_components > 2)
2716             slot_count++;
2717
2718          const unsigned max_push_slots = 32;
2719          if (imm_offset + slot_count <= max_push_slots) {
2720             fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
2721             for (int i = 0; i < instr->num_components; i++) {
2722                unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) +
2723                   i + first_component;
2724                bld.MOV(offset(dest, bld, i), component(src, comp));
2725             }
2726
2727             tes_prog_data->base.urb_read_length =
2728                MAX2(tes_prog_data->base.urb_read_length,
2729                     DIV_ROUND_UP(imm_offset + slot_count, 2));
2730          } else {
2731             /* Replicate the patch handle to all enabled channels */
2732             const fs_reg srcs[] = {
2733                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
2734             };
2735             fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2736             bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
2737
2738             if (first_component != 0) {
2739                unsigned read_components =
2740                   instr->num_components + first_component;
2741                fs_reg tmp = bld.vgrf(dest.type, read_components);
2742                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2743                                patch_handle);
2744                inst->size_written = read_components * REG_SIZE;
2745                for (unsigned i = 0; i < instr->num_components; i++) {
2746                   bld.MOV(offset(dest, bld, i),
2747                           offset(tmp, bld, i + first_component));
2748                }
2749             } else {
2750                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
2751                                patch_handle);
2752                inst->size_written = instr->num_components * REG_SIZE;
2753             }
2754             inst->mlen = 1;
2755             inst->offset = imm_offset;
2756          }
2757       } else {
2758          /* Indirect indexing - use per-slot offsets as well. */
2759
2760          /* We can only read two double components with each URB read, so
2761           * we send two read messages in that case, each one loading up to
2762           * two double components.
2763           */
2764          unsigned num_iterations = 1;
2765          unsigned num_components = instr->num_components;
2766          fs_reg orig_dest = dest;
2767          if (type_sz(dest.type) == 8) {
2768             if (instr->num_components > 2) {
2769                num_iterations = 2;
2770                num_components = 2;
2771             }
2772             fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type);
2773             dest = tmp;
2774          }
2775
2776          for (unsigned iter = 0; iter < num_iterations; iter++) {
2777             const fs_reg srcs[] = {
2778                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2779                indirect_offset
2780             };
2781             fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2782             bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2783
2784             if (first_component != 0) {
2785                unsigned read_components =
2786                    num_components + first_component;
2787                fs_reg tmp = bld.vgrf(dest.type, read_components);
2788                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2789                                payload);
2790                for (unsigned i = 0; i < num_components; i++) {
2791                   bld.MOV(offset(dest, bld, i),
2792                           offset(tmp, bld, i + first_component));
2793                }
2794             } else {
2795                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,
2796                                payload);
2797             }
2798             inst->mlen = 2;
2799             inst->offset = imm_offset;
2800             inst->size_written = (num_components + first_component) *
2801                                  inst->dst.component_size(inst->exec_size);
2802
2803             /* If we are reading 64-bit data using 32-bit read messages we need
2804              * build proper 64-bit data elements by shuffling the low and high
2805              * 32-bit components around like we do for other things like UBOs
2806              * or SSBOs.
2807              */
2808             if (type_sz(dest.type) == 8) {
2809                shuffle_from_32bit_read(bld,
2810                                        offset(orig_dest, bld, iter * 2),
2811                                        retype(dest, BRW_REGISTER_TYPE_D),
2812                                        0, num_components);
2813             }
2814
2815             /* If we are loading double data and we need a second read message
2816              * adjust the offset
2817              */
2818             if (num_iterations > 1) {
2819                num_components = instr->num_components - 2;
2820                imm_offset++;
2821             }
2822          }
2823       }
2824       break;
2825    }
2826    default:
2827       nir_emit_intrinsic(bld, instr);
2828       break;
2829    }
2830 }
2831
2832 void
2833 fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
2834                                   nir_intrinsic_instr *instr)
2835 {
2836    assert(stage == MESA_SHADER_GEOMETRY);
2837    fs_reg indirect_offset;
2838
2839    fs_reg dest;
2840    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2841       dest = get_nir_dest(instr->dest);
2842
2843    switch (instr->intrinsic) {
2844    case nir_intrinsic_load_primitive_id:
2845       assert(stage == MESA_SHADER_GEOMETRY);
2846       assert(brw_gs_prog_data(prog_data)->include_primitive_id);
2847       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
2848               retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
2849       break;
2850
2851    case nir_intrinsic_load_input:
2852       unreachable("load_input intrinsics are invalid for the GS stage");
2853
2854    case nir_intrinsic_load_per_vertex_input:
2855       emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
2856                          instr->src[1], instr->num_components,
2857                          nir_intrinsic_component(instr));
2858       break;
2859
2860    case nir_intrinsic_emit_vertex_with_counter:
2861       emit_gs_vertex(instr->src[0], instr->const_index[0]);
2862       break;
2863
2864    case nir_intrinsic_end_primitive_with_counter:
2865       emit_gs_end_primitive(instr->src[0]);
2866       break;
2867
2868    case nir_intrinsic_set_vertex_count:
2869       bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
2870       break;
2871
2872    case nir_intrinsic_load_invocation_id: {
2873       fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
2874       assert(val.file != BAD_FILE);
2875       dest.type = val.type;
2876       bld.MOV(dest, val);
2877       break;
2878    }
2879
2880    default:
2881       nir_emit_intrinsic(bld, instr);
2882       break;
2883    }
2884 }
2885
2886 /**
2887  * Fetch the current render target layer index.
2888  */
2889 static fs_reg
2890 fetch_render_target_array_index(const fs_builder &bld)
2891 {
2892    if (bld.shader->devinfo->gen >= 6) {
2893       /* The render target array index is provided in the thread payload as
2894        * bits 26:16 of r0.0.
2895        */
2896       const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
2897       bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
2898               brw_imm_uw(0x7ff));
2899       return idx;
2900    } else {
2901       /* Pre-SNB we only ever render into the first layer of the framebuffer
2902        * since layered rendering is not implemented.
2903        */
2904       return brw_imm_ud(0);
2905    }
2906 }
2907
2908 /**
2909  * Fake non-coherent framebuffer read implemented using TXF to fetch from the
2910  * framebuffer at the current fragment coordinates and sample index.
2911  */
2912 fs_inst *
2913 fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
2914                                       unsigned target)
2915 {
2916    const struct gen_device_info *devinfo = bld.shader->devinfo;
2917
2918    assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
2919    const brw_wm_prog_key *wm_key =
2920       reinterpret_cast<const brw_wm_prog_key *>(key);
2921    assert(!wm_key->coherent_fb_fetch);
2922    const struct brw_wm_prog_data *wm_prog_data =
2923       brw_wm_prog_data(stage_prog_data);
2924
2925    /* Calculate the surface index relative to the start of the texture binding
2926     * table block, since that's what the texturing messages expect.
2927     */
2928    const unsigned surface = target +
2929       wm_prog_data->binding_table.render_target_read_start -
2930       wm_prog_data->base.binding_table.texture_start;
2931
2932    brw_mark_surface_used(
2933       bld.shader->stage_prog_data,
2934       wm_prog_data->binding_table.render_target_read_start + target);
2935
2936    /* Calculate the fragment coordinates. */
2937    const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
2938    bld.MOV(offset(coords, bld, 0), pixel_x);
2939    bld.MOV(offset(coords, bld, 1), pixel_y);
2940    bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
2941
2942    /* Calculate the sample index and MCS payload when multisampling.  Luckily
2943     * the MCS fetch message behaves deterministically for UMS surfaces, so it
2944     * shouldn't be necessary to recompile based on whether the framebuffer is
2945     * CMS or UMS.
2946     */
2947    if (wm_key->multisample_fbo &&
2948        nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
2949       nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
2950
2951    const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
2952    const fs_reg mcs = wm_key->multisample_fbo ?
2953       emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg();
2954
2955    /* Use either a normal or a CMS texel fetch message depending on whether
2956     * the framebuffer is single or multisample.  On SKL+ use the wide CMS
2957     * message just in case the framebuffer uses 16x multisampling, it should
2958     * be equivalent to the normal CMS fetch for lower multisampling modes.
2959     */
2960    const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL :
2961                      devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :
2962                      SHADER_OPCODE_TXF_CMS_LOGICAL;
2963
2964    /* Emit the instruction. */
2965    const fs_reg srcs[] = { coords, fs_reg(), brw_imm_ud(0), fs_reg(),
2966                            sample, mcs,
2967                            brw_imm_ud(surface), brw_imm_ud(0),
2968                            fs_reg(), brw_imm_ud(3), brw_imm_ud(0) };
2969    STATIC_ASSERT(ARRAY_SIZE(srcs) == TEX_LOGICAL_NUM_SRCS);
2970
2971    fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
2972    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
2973
2974    return inst;
2975 }
2976
2977 /**
2978  * Actual coherent framebuffer read implemented using the native render target
2979  * read message.  Requires SKL+.
2980  */
2981 static fs_inst *
2982 emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
2983 {
2984    assert(bld.shader->devinfo->gen >= 9);
2985    fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
2986    inst->target = target;
2987    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
2988
2989    return inst;
2990 }
2991
2992 static fs_reg
2993 alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
2994 {
2995    if (n && regs[0].file != BAD_FILE) {
2996       return regs[0];
2997
2998    } else {
2999       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
3000
3001       for (unsigned i = 0; i < n; i++)
3002          regs[i] = tmp;
3003
3004       return tmp;
3005    }
3006 }
3007
3008 static fs_reg
3009 alloc_frag_output(fs_visitor *v, unsigned location)
3010 {
3011    assert(v->stage == MESA_SHADER_FRAGMENT);
3012    const brw_wm_prog_key *const key =
3013       reinterpret_cast<const brw_wm_prog_key *>(v->key);
3014    const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3015    const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3016
3017    if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3018       return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
3019
3020    else if (l == FRAG_RESULT_COLOR)
3021       return alloc_temporary(v->bld, 4, v->outputs,
3022                              MAX2(key->nr_color_regions, 1));
3023
3024    else if (l == FRAG_RESULT_DEPTH)
3025       return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
3026
3027    else if (l == FRAG_RESULT_STENCIL)
3028       return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
3029
3030    else if (l == FRAG_RESULT_SAMPLE_MASK)
3031       return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
3032
3033    else if (l >= FRAG_RESULT_DATA0 &&
3034             l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3035       return alloc_temporary(v->bld, 4,
3036                              &v->outputs[l - FRAG_RESULT_DATA0], 1);
3037
3038    else
3039       unreachable("Invalid location");
3040 }
3041
3042 void
3043 fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
3044                                   nir_intrinsic_instr *instr)
3045 {
3046    assert(stage == MESA_SHADER_FRAGMENT);
3047
3048    fs_reg dest;
3049    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3050       dest = get_nir_dest(instr->dest);
3051
3052    switch (instr->intrinsic) {
3053    case nir_intrinsic_load_front_face:
3054       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
3055               *emit_frontfacing_interpolation());
3056       break;
3057
3058    case nir_intrinsic_load_sample_pos: {
3059       fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
3060       assert(sample_pos.file != BAD_FILE);
3061       dest.type = sample_pos.type;
3062       bld.MOV(dest, sample_pos);
3063       bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3064       break;
3065    }
3066
3067    case nir_intrinsic_load_layer_id:
3068       dest.type = BRW_REGISTER_TYPE_UD;
3069       bld.MOV(dest, fetch_render_target_array_index(bld));
3070       break;
3071
3072    case nir_intrinsic_load_helper_invocation:
3073    case nir_intrinsic_load_sample_mask_in:
3074    case nir_intrinsic_load_sample_id: {
3075       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3076       fs_reg val = nir_system_values[sv];
3077       assert(val.file != BAD_FILE);
3078       dest.type = val.type;
3079       bld.MOV(dest, val);
3080       break;
3081    }
3082
3083    case nir_intrinsic_store_output: {
3084       const fs_reg src = get_nir_src(instr->src[0]);
3085       const unsigned store_offset = nir_src_as_uint(instr->src[1]);
3086       const unsigned location = nir_intrinsic_base(instr) +
3087          SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);
3088       const fs_reg new_dest = retype(alloc_frag_output(this, location),
3089                                      src.type);
3090
3091       for (unsigned j = 0; j < instr->num_components; j++)
3092          bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3093                  offset(src, bld, j));
3094
3095       break;
3096    }
3097
3098    case nir_intrinsic_load_output: {
3099       const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3100                                    BRW_NIR_FRAG_OUTPUT_LOCATION);
3101       assert(l >= FRAG_RESULT_DATA0);
3102       const unsigned load_offset = nir_src_as_uint(instr->src[0]);
3103       const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
3104       const fs_reg tmp = bld.vgrf(dest.type, 4);
3105
3106       if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
3107          emit_coherent_fb_read(bld, tmp, target);
3108       else
3109          emit_non_coherent_fb_read(bld, tmp, target);
3110
3111       for (unsigned j = 0; j < instr->num_components; j++) {
3112          bld.MOV(offset(dest, bld, j),
3113                  offset(tmp, bld, nir_intrinsic_component(instr) + j));
3114       }
3115
3116       break;
3117    }
3118
3119    case nir_intrinsic_discard:
3120    case nir_intrinsic_discard_if: {
3121       /* We track our discarded pixels in f0.1.  By predicating on it, we can
3122        * update just the flag bits that aren't yet discarded.  If there's no
3123        * condition, we emit a CMP of g0 != g0, so all currently executing
3124        * channels will get turned off.
3125        */
3126       fs_inst *cmp;
3127       if (instr->intrinsic == nir_intrinsic_discard_if) {
3128          cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
3129                        brw_imm_d(0), BRW_CONDITIONAL_Z);
3130       } else {
3131          fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3132                                        BRW_REGISTER_TYPE_UW));
3133          cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
3134       }
3135       cmp->predicate = BRW_PREDICATE_NORMAL;
3136       cmp->flag_subreg = 1;
3137
3138       if (devinfo->gen >= 6) {
3139          emit_discard_jump();
3140       }
3141
3142       limit_dispatch_width(16, "Fragment discard not implemented in SIMD32 mode.");
3143       break;
3144    }
3145
3146    case nir_intrinsic_load_input: {
3147       /* load_input is only used for flat inputs */
3148       unsigned base = nir_intrinsic_base(instr);
3149       unsigned comp = nir_intrinsic_component(instr);
3150       unsigned num_components = instr->num_components;
3151       fs_reg orig_dest = dest;
3152       enum brw_reg_type type = dest.type;
3153
3154       /* Special case fields in the VUE header */
3155       if (base == VARYING_SLOT_LAYER)
3156          comp = 1;
3157       else if (base == VARYING_SLOT_VIEWPORT)
3158          comp = 2;
3159
3160       if (nir_dest_bit_size(instr->dest) == 64) {
3161          /* const_index is in 32-bit type size units that could not be aligned
3162           * with DF. We need to read the double vector as if it was a float
3163           * vector of twice the number of components to fetch the right data.
3164           */
3165          type = BRW_REGISTER_TYPE_F;
3166          num_components *= 2;
3167          dest = bld.vgrf(type, num_components);
3168       }
3169
3170       for (unsigned int i = 0; i < num_components; i++) {
3171          bld.MOV(offset(retype(dest, type), bld, i),
3172                  retype(component(interp_reg(base, comp + i), 3), type));
3173       }
3174
3175       if (nir_dest_bit_size(instr->dest) == 64) {
3176          shuffle_from_32bit_read(bld, orig_dest, dest, 0,
3177                                  instr->num_components);
3178       }
3179       break;
3180    }
3181
3182    case nir_intrinsic_load_barycentric_pixel:
3183    case nir_intrinsic_load_barycentric_centroid:
3184    case nir_intrinsic_load_barycentric_sample:
3185       /* Do nothing - load_interpolated_input handling will handle it later. */
3186       break;
3187
3188    case nir_intrinsic_load_barycentric_at_sample: {
3189       const glsl_interp_mode interpolation =
3190          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3191
3192       if (nir_src_is_const(instr->src[0])) {
3193          unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4;
3194
3195          emit_pixel_interpolater_send(bld,
3196                                       FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3197                                       dest,
3198                                       fs_reg(), /* src */
3199                                       brw_imm_ud(msg_data),
3200                                       interpolation);
3201       } else {
3202          const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
3203                                           BRW_REGISTER_TYPE_UD);
3204
3205          if (nir_src_is_dynamically_uniform(instr->src[0])) {
3206             const fs_reg sample_id = bld.emit_uniformize(sample_src);
3207             const fs_reg msg_data = vgrf(glsl_type::uint_type);
3208             bld.exec_all().group(1, 0)
3209                .SHL(msg_data, sample_id, brw_imm_ud(4u));
3210             emit_pixel_interpolater_send(bld,
3211                                          FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3212                                          dest,
3213                                          fs_reg(), /* src */
3214                                          msg_data,
3215                                          interpolation);
3216          } else {
3217             /* Make a loop that sends a message to the pixel interpolater
3218              * for the sample number in each live channel. If there are
3219              * multiple channels with the same sample number then these
3220              * will be handled simultaneously with a single interation of
3221              * the loop.
3222              */
3223             bld.emit(BRW_OPCODE_DO);
3224
3225             /* Get the next live sample number into sample_id_reg */
3226             const fs_reg sample_id = bld.emit_uniformize(sample_src);
3227
3228             /* Set the flag register so that we can perform the send
3229              * message on all channels that have the same sample number
3230              */
3231             bld.CMP(bld.null_reg_ud(),
3232                     sample_src, sample_id,
3233                     BRW_CONDITIONAL_EQ);
3234             const fs_reg msg_data = vgrf(glsl_type::uint_type);
3235             bld.exec_all().group(1, 0)
3236                .SHL(msg_data, sample_id, brw_imm_ud(4u));
3237             fs_inst *inst =
3238                emit_pixel_interpolater_send(bld,
3239                                             FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3240                                             dest,
3241                                             fs_reg(), /* src */
3242                                             component(msg_data, 0),
3243                                             interpolation);
3244             set_predicate(BRW_PREDICATE_NORMAL, inst);
3245
3246             /* Continue the loop if there are any live channels left */
3247             set_predicate_inv(BRW_PREDICATE_NORMAL,
3248                               true, /* inverse */
3249                               bld.emit(BRW_OPCODE_WHILE));
3250          }
3251       }
3252       break;
3253    }
3254
3255    case nir_intrinsic_load_barycentric_at_offset: {
3256       const glsl_interp_mode interpolation =
3257          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3258
3259       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3260
3261       if (const_offset) {
3262          assert(nir_src_bit_size(instr->src[0]) == 32);
3263          unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
3264          unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
3265
3266          emit_pixel_interpolater_send(bld,
3267                                       FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3268                                       dest,
3269                                       fs_reg(), /* src */
3270                                       brw_imm_ud(off_x | (off_y << 4)),
3271                                       interpolation);
3272       } else {
3273          fs_reg src = vgrf(glsl_type::ivec2_type);
3274          fs_reg offset_src = retype(get_nir_src(instr->src[0]),
3275                                     BRW_REGISTER_TYPE_F);
3276          for (int i = 0; i < 2; i++) {
3277             fs_reg temp = vgrf(glsl_type::float_type);
3278             bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
3279             fs_reg itemp = vgrf(glsl_type::int_type);
3280             /* float to int */
3281             bld.MOV(itemp, temp);
3282
3283             /* Clamp the upper end of the range to +7/16.
3284              * ARB_gpu_shader5 requires that we support a maximum offset
3285              * of +0.5, which isn't representable in a S0.4 value -- if
3286              * we didn't clamp it, we'd end up with -8/16, which is the
3287              * opposite of what the shader author wanted.
3288              *
3289              * This is legal due to ARB_gpu_shader5's quantization
3290              * rules:
3291              *
3292              * "Not all values of <offset> may be supported; x and y
3293              * offsets may be rounded to fixed-point values with the
3294              * number of fraction bits given by the
3295              * implementation-dependent constant
3296              * FRAGMENT_INTERPOLATION_OFFSET_BITS"
3297              */
3298             set_condmod(BRW_CONDITIONAL_L,
3299                         bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
3300          }
3301
3302          const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3303          emit_pixel_interpolater_send(bld,
3304                                       opcode,
3305                                       dest,
3306                                       src,
3307                                       brw_imm_ud(0u),
3308                                       interpolation);
3309       }
3310       break;
3311    }
3312
3313    case nir_intrinsic_load_interpolated_input: {
3314       if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
3315          emit_fragcoord_interpolation(dest);
3316          break;
3317       }
3318
3319       assert(instr->src[0].ssa &&
3320              instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3321       nir_intrinsic_instr *bary_intrinsic =
3322          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3323       nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3324       enum glsl_interp_mode interp_mode =
3325          (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3326       fs_reg dst_xy;
3327
3328       if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3329           bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3330          /* Use the result of the PI message */
3331          dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
3332       } else {
3333          /* Use the delta_xy values computed from the payload */
3334          enum brw_barycentric_mode bary =
3335             brw_barycentric_mode(interp_mode, bary_intrin);
3336
3337          dst_xy = this->delta_xy[bary];
3338       }
3339
3340       for (unsigned int i = 0; i < instr->num_components; i++) {
3341          fs_reg interp =
3342             component(interp_reg(nir_intrinsic_base(instr),
3343                                  nir_intrinsic_component(instr) + i), 0);
3344          interp.type = BRW_REGISTER_TYPE_F;
3345          dest.type = BRW_REGISTER_TYPE_F;
3346
3347          if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3348             fs_reg tmp = vgrf(glsl_type::float_type);
3349             bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
3350             bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
3351          } else {
3352             bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
3353          }
3354       }
3355       break;
3356    }
3357
3358    default:
3359       nir_emit_intrinsic(bld, instr);
3360       break;
3361    }
3362 }
3363
3364 static int
3365 get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
3366 {
3367    if (nir_src_is_const(instr->src[src])) {
3368       int64_t add_val = nir_src_as_int(instr->src[src]);
3369       if (add_val == 1)
3370          return BRW_AOP_INC;
3371       else if (add_val == -1)
3372          return BRW_AOP_DEC;
3373    }
3374
3375    return BRW_AOP_ADD;
3376 }
3377
3378 void
3379 fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
3380                                   nir_intrinsic_instr *instr)
3381 {
3382    assert(stage == MESA_SHADER_COMPUTE);
3383    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
3384
3385    fs_reg dest;
3386    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3387       dest = get_nir_dest(instr->dest);
3388
3389    switch (instr->intrinsic) {
3390    case nir_intrinsic_barrier:
3391       emit_barrier();
3392       cs_prog_data->uses_barrier = true;
3393       break;
3394
3395    case nir_intrinsic_load_subgroup_id:
3396       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id);
3397       break;
3398
3399    case nir_intrinsic_load_local_invocation_id:
3400    case nir_intrinsic_load_work_group_id: {
3401       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3402       fs_reg val = nir_system_values[sv];
3403       assert(val.file != BAD_FILE);
3404       dest.type = val.type;
3405       for (unsigned i = 0; i < 3; i++)
3406          bld.MOV(offset(dest, bld, i), offset(val, bld, i));
3407       break;
3408    }
3409
3410    case nir_intrinsic_load_num_work_groups: {
3411       const unsigned surface =
3412          cs_prog_data->binding_table.work_groups_start;
3413
3414       cs_prog_data->uses_num_work_groups = true;
3415
3416       fs_reg surf_index = brw_imm_ud(surface);
3417       brw_mark_surface_used(prog_data, surface);
3418
3419       /* Read the 3 GLuint components of gl_NumWorkGroups */
3420       for (unsigned i = 0; i < 3; i++) {
3421          fs_reg read_result =
3422             emit_untyped_read(bld, surf_index,
3423                               brw_imm_ud(i << 2),
3424                               1 /* dims */, 1 /* size */,
3425                               BRW_PREDICATE_NONE);
3426          read_result.type = dest.type;
3427          bld.MOV(dest, read_result);
3428          dest = offset(dest, bld, 1);
3429       }
3430       break;
3431    }
3432
3433    case nir_intrinsic_shared_atomic_add:
3434       nir_emit_shared_atomic(bld, get_op_for_atomic_add(instr, 1), instr);
3435       break;
3436    case nir_intrinsic_shared_atomic_imin:
3437       nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr);
3438       break;
3439    case nir_intrinsic_shared_atomic_umin:
3440       nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr);
3441       break;
3442    case nir_intrinsic_shared_atomic_imax:
3443       nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr);
3444       break;
3445    case nir_intrinsic_shared_atomic_umax:
3446       nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr);
3447       break;
3448    case nir_intrinsic_shared_atomic_and:
3449       nir_emit_shared_atomic(bld, BRW_AOP_AND, instr);
3450       break;
3451    case nir_intrinsic_shared_atomic_or:
3452       nir_emit_shared_atomic(bld, BRW_AOP_OR, instr);
3453       break;
3454    case nir_intrinsic_shared_atomic_xor:
3455       nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr);
3456       break;
3457    case nir_intrinsic_shared_atomic_exchange:
3458       nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr);
3459       break;
3460    case nir_intrinsic_shared_atomic_comp_swap:
3461       nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
3462       break;
3463    case nir_intrinsic_shared_atomic_fmin:
3464       nir_emit_shared_atomic_float(bld, BRW_AOP_FMIN, instr);
3465       break;
3466    case nir_intrinsic_shared_atomic_fmax:
3467       nir_emit_shared_atomic_float(bld, BRW_AOP_FMAX, instr);
3468       break;
3469    case nir_intrinsic_shared_atomic_fcomp_swap:
3470       nir_emit_shared_atomic_float(bld, BRW_AOP_FCMPWR, instr);
3471       break;
3472
3473    case nir_intrinsic_load_shared: {
3474       assert(devinfo->gen >= 7);
3475       assert(stage == MESA_SHADER_COMPUTE);
3476
3477       const unsigned bit_size = nir_dest_bit_size(instr->dest);
3478       fs_reg offset_reg = retype(get_nir_src(instr->src[0]),
3479                                  BRW_REGISTER_TYPE_UD);
3480
3481       /* Make dest unsigned because that's what the temporary will be */
3482       dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
3483
3484       /* Read the vector */
3485       if (nir_intrinsic_align(instr) >= 4) {
3486          assert(nir_dest_bit_size(instr->dest) == 32);
3487          fs_reg read_result = emit_untyped_read(bld, brw_imm_ud(GEN7_BTI_SLM),
3488                                                 offset_reg, 1 /* dims */,
3489                                                 instr->num_components,
3490                                                 BRW_PREDICATE_NONE);
3491          for (unsigned i = 0; i < instr->num_components; i++)
3492             bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
3493       } else {
3494          assert(nir_dest_bit_size(instr->dest) <= 32);
3495          assert(nir_dest_num_components(instr->dest) == 1);
3496          fs_reg read_result =
3497             emit_byte_scattered_read(bld, brw_imm_ud(GEN7_BTI_SLM), offset_reg,
3498                                      1 /* dims */, 1, bit_size,
3499                                      BRW_PREDICATE_NONE);
3500          bld.MOV(dest, read_result);
3501       }
3502       break;
3503    }
3504
3505    case nir_intrinsic_store_shared: {
3506       assert(devinfo->gen >= 7);
3507       assert(stage == MESA_SHADER_COMPUTE);
3508
3509       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
3510       fs_reg val_reg = get_nir_src(instr->src[0]);
3511       fs_reg offset_reg = retype(get_nir_src(instr->src[1]),
3512                                  BRW_REGISTER_TYPE_UD);
3513
3514       val_reg.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
3515
3516       assert(nir_intrinsic_write_mask(instr) ==
3517              (1 << instr->num_components) - 1);
3518       if (nir_intrinsic_align(instr) >= 4) {
3519          assert(nir_src_bit_size(instr->src[0]) == 32);
3520          assert(nir_src_num_components(instr->src[0]) <= 4);
3521          emit_untyped_write(bld, brw_imm_ud(GEN7_BTI_SLM), offset_reg, val_reg,
3522                             1 /* dims */, instr->num_components,
3523                             BRW_PREDICATE_NONE);
3524       } else {
3525          assert(nir_src_bit_size(instr->src[0]) <= 32);
3526          assert(nir_src_num_components(instr->src[0]) == 1);
3527          fs_reg write_src = bld.vgrf(BRW_REGISTER_TYPE_UD);
3528          bld.MOV(write_src, val_reg);
3529          emit_byte_scattered_write(bld, brw_imm_ud(GEN7_BTI_SLM), offset_reg,
3530                                    write_src, 1 /* dims */, bit_size,
3531                                    BRW_PREDICATE_NONE);
3532       }
3533       break;
3534    }
3535
3536    default:
3537       nir_emit_intrinsic(bld, instr);
3538       break;
3539    }
3540 }
3541
3542 static fs_reg
3543 brw_nir_reduction_op_identity(const fs_builder &bld,
3544                               nir_op op, brw_reg_type type)
3545 {
3546    nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
3547    switch (type_sz(type)) {
3548    case 2:
3549       assert(type != BRW_REGISTER_TYPE_HF);
3550       return retype(brw_imm_uw(value.u16[0]), type);
3551    case 4:
3552       return retype(brw_imm_ud(value.u32[0]), type);
3553    case 8:
3554       if (type == BRW_REGISTER_TYPE_DF)
3555          return setup_imm_df(bld, value.f64[0]);
3556       else
3557          return retype(brw_imm_u64(value.u64[0]), type);
3558    default:
3559       unreachable("Invalid type size");
3560    }
3561 }
3562
3563 static opcode
3564 brw_op_for_nir_reduction_op(nir_op op)
3565 {
3566    switch (op) {
3567    case nir_op_iadd: return BRW_OPCODE_ADD;
3568    case nir_op_fadd: return BRW_OPCODE_ADD;
3569    case nir_op_imul: return BRW_OPCODE_MUL;
3570    case nir_op_fmul: return BRW_OPCODE_MUL;
3571    case nir_op_imin: return BRW_OPCODE_SEL;
3572    case nir_op_umin: return BRW_OPCODE_SEL;
3573    case nir_op_fmin: return BRW_OPCODE_SEL;
3574    case nir_op_imax: return BRW_OPCODE_SEL;
3575    case nir_op_umax: return BRW_OPCODE_SEL;
3576    case nir_op_fmax: return BRW_OPCODE_SEL;
3577    case nir_op_iand: return BRW_OPCODE_AND;
3578    case nir_op_ior:  return BRW_OPCODE_OR;
3579    case nir_op_ixor: return BRW_OPCODE_XOR;
3580    default:
3581       unreachable("Invalid reduction operation");
3582    }
3583 }
3584
3585 static brw_conditional_mod
3586 brw_cond_mod_for_nir_reduction_op(nir_op op)
3587 {
3588    switch (op) {
3589    case nir_op_iadd: return BRW_CONDITIONAL_NONE;
3590    case nir_op_fadd: return BRW_CONDITIONAL_NONE;
3591    case nir_op_imul: return BRW_CONDITIONAL_NONE;
3592    case nir_op_fmul: return BRW_CONDITIONAL_NONE;
3593    case nir_op_imin: return BRW_CONDITIONAL_L;
3594    case nir_op_umin: return BRW_CONDITIONAL_L;
3595    case nir_op_fmin: return BRW_CONDITIONAL_L;
3596    case nir_op_imax: return BRW_CONDITIONAL_GE;
3597    case nir_op_umax: return BRW_CONDITIONAL_GE;
3598    case nir_op_fmax: return BRW_CONDITIONAL_GE;
3599    case nir_op_iand: return BRW_CONDITIONAL_NONE;
3600    case nir_op_ior:  return BRW_CONDITIONAL_NONE;
3601    case nir_op_ixor: return BRW_CONDITIONAL_NONE;
3602    default:
3603       unreachable("Invalid reduction operation");
3604    }
3605 }
3606
3607 fs_reg
3608 fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld,
3609                                           nir_intrinsic_instr *instr)
3610 {
3611    fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD);
3612
3613    if (stage_prog_data->binding_table.image_start > 0) {
3614       if (image.file == BRW_IMMEDIATE_VALUE) {
3615          image.d += stage_prog_data->binding_table.image_start;
3616       } else {
3617          bld.ADD(image, image,
3618                  brw_imm_d(stage_prog_data->binding_table.image_start));
3619       }
3620    }
3621
3622    return bld.emit_uniformize(image);
3623 }
3624
3625 fs_reg
3626 fs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld,
3627                                          nir_intrinsic_instr *instr)
3628 {
3629    /* SSBO stores are weird in that their index is in src[1] */
3630    const unsigned src = instr->intrinsic == nir_intrinsic_store_ssbo ? 1 : 0;
3631
3632    fs_reg surf_index;
3633    if (nir_src_is_const(instr->src[src])) {
3634       unsigned index = stage_prog_data->binding_table.ssbo_start +
3635                        nir_src_as_uint(instr->src[src]);
3636       surf_index = brw_imm_ud(index);
3637       brw_mark_surface_used(prog_data, index);
3638    } else {
3639       surf_index = vgrf(glsl_type::uint_type);
3640       bld.ADD(surf_index, get_nir_src(instr->src[src]),
3641               brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
3642
3643       /* Assume this may touch any UBO. It would be nice to provide
3644        * a tighter bound, but the array information is already lowered away.
3645        */
3646       brw_mark_surface_used(prog_data,
3647                             stage_prog_data->binding_table.ssbo_start +
3648                             nir->info.num_ssbos - 1);
3649    }
3650
3651    return surf_index;
3652 }
3653
3654 static unsigned
3655 image_intrinsic_coord_components(nir_intrinsic_instr *instr)
3656 {
3657    switch (nir_intrinsic_image_dim(instr)) {
3658    case GLSL_SAMPLER_DIM_1D:
3659       return 1 + nir_intrinsic_image_array(instr);
3660    case GLSL_SAMPLER_DIM_2D:
3661    case GLSL_SAMPLER_DIM_RECT:
3662       return 2 + nir_intrinsic_image_array(instr);
3663    case GLSL_SAMPLER_DIM_3D:
3664    case GLSL_SAMPLER_DIM_CUBE:
3665       return 3;
3666    case GLSL_SAMPLER_DIM_BUF:
3667       return 1;
3668    case GLSL_SAMPLER_DIM_MS:
3669       return 2 + nir_intrinsic_image_array(instr);
3670    default:
3671       unreachable("Invalid image dimension");
3672    }
3673 }
3674
3675 void
3676 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
3677 {
3678    fs_reg dest;
3679    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3680       dest = get_nir_dest(instr->dest);
3681
3682    switch (instr->intrinsic) {
3683    case nir_intrinsic_image_load:
3684    case nir_intrinsic_image_store:
3685    case nir_intrinsic_image_atomic_add:
3686    case nir_intrinsic_image_atomic_min:
3687    case nir_intrinsic_image_atomic_max:
3688    case nir_intrinsic_image_atomic_and:
3689    case nir_intrinsic_image_atomic_or:
3690    case nir_intrinsic_image_atomic_xor:
3691    case nir_intrinsic_image_atomic_exchange:
3692    case nir_intrinsic_image_atomic_comp_swap: {
3693       if (stage == MESA_SHADER_FRAGMENT &&
3694           instr->intrinsic != nir_intrinsic_image_load)
3695          brw_wm_prog_data(prog_data)->has_side_effects = true;
3696
3697       /* Get some metadata from the image intrinsic. */
3698       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3699       const unsigned dims = image_intrinsic_coord_components(instr);
3700       const GLenum format = nir_intrinsic_format(instr);
3701       const unsigned dest_components = nir_intrinsic_dest_components(instr);
3702
3703       /* Get the arguments of the image intrinsic. */
3704       const fs_reg image = get_nir_image_intrinsic_image(bld, instr);
3705       const fs_reg coords = retype(get_nir_src(instr->src[1]),
3706                                    BRW_REGISTER_TYPE_UD);
3707       fs_reg tmp;
3708
3709       /* Emit an image load, store or atomic op. */
3710       if (instr->intrinsic == nir_intrinsic_image_load) {
3711          tmp = emit_typed_read(bld, image, coords, dims,
3712                                instr->num_components);
3713       } else if (instr->intrinsic == nir_intrinsic_image_store) {
3714          const fs_reg src0 = get_nir_src(instr->src[3]);
3715          emit_typed_write(bld, image, coords, src0, dims,
3716                           instr->num_components);
3717       } else {
3718          int op;
3719          unsigned num_srcs = info->num_srcs;
3720
3721          switch (instr->intrinsic) {
3722          case nir_intrinsic_image_atomic_add:
3723             assert(num_srcs == 4);
3724
3725             op = get_op_for_atomic_add(instr, 3);
3726
3727             if (op != BRW_AOP_ADD)
3728                num_srcs = 3;
3729             break;
3730          case nir_intrinsic_image_atomic_min:
3731             assert(format == GL_R32UI || format == GL_R32I);
3732             op = (format == GL_R32I) ? BRW_AOP_IMIN : BRW_AOP_UMIN;
3733             break;
3734          case nir_intrinsic_image_atomic_max:
3735             assert(format == GL_R32UI || format == GL_R32I);
3736             op = (format == GL_R32I) ? BRW_AOP_IMAX : BRW_AOP_UMAX;
3737             break;
3738          case nir_intrinsic_image_atomic_and:
3739             op = BRW_AOP_AND;
3740             break;
3741          case nir_intrinsic_image_atomic_or:
3742             op = BRW_AOP_OR;
3743             break;
3744          case nir_intrinsic_image_atomic_xor:
3745             op = BRW_AOP_XOR;
3746             break;
3747          case nir_intrinsic_image_atomic_exchange:
3748             op = BRW_AOP_MOV;
3749             break;
3750          case nir_intrinsic_image_atomic_comp_swap:
3751             op = BRW_AOP_CMPWR;
3752             break;
3753          default:
3754             unreachable("Not reachable.");
3755          }
3756
3757          const fs_reg src0 = (num_srcs >= 4 ?
3758                               get_nir_src(instr->src[3]) : fs_reg());
3759          const fs_reg src1 = (num_srcs >= 5 ?
3760                               get_nir_src(instr->src[4]) : fs_reg());
3761
3762          tmp = emit_typed_atomic(bld, image, coords, src0, src1, dims, 1, op);
3763       }
3764
3765       /* Assign the result. */
3766       for (unsigned c = 0; c < dest_components; ++c) {
3767          bld.MOV(offset(retype(dest, tmp.type), bld, c),
3768                  offset(tmp, bld, c));
3769       }
3770       break;
3771    }
3772
3773    case nir_intrinsic_image_size: {
3774       /* Unlike the [un]typed load and store opcodes, the TXS that this turns
3775        * into will handle the binding table index for us in the geneerator.
3776        */
3777       fs_reg image = retype(get_nir_src_imm(instr->src[0]),
3778                             BRW_REGISTER_TYPE_UD);
3779       image = bld.emit_uniformize(image);
3780
3781       /* Since the image size is always uniform, we can just emit a SIMD8
3782        * query instruction and splat the result out.
3783        */
3784       const fs_builder ubld = bld.exec_all().group(8, 0);
3785
3786       /* The LOD also serves as the message payload */
3787       fs_reg lod = ubld.vgrf(BRW_REGISTER_TYPE_UD);
3788       ubld.MOV(lod, brw_imm_ud(0));
3789
3790       fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
3791       fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE, tmp, lod, image);
3792       inst->mlen = 1;
3793       inst->size_written = 4 * REG_SIZE;
3794
3795       for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
3796          if (c == 2 && nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_CUBE) {
3797             bld.emit(SHADER_OPCODE_INT_QUOTIENT,
3798                      offset(retype(dest, tmp.type), bld, c),
3799                      component(offset(tmp, ubld, c), 0), brw_imm_ud(6));
3800          } else {
3801             bld.MOV(offset(retype(dest, tmp.type), bld, c),
3802                     component(offset(tmp, ubld, c), 0));
3803          }
3804       }
3805       break;
3806    }
3807
3808    case nir_intrinsic_image_load_raw_intel: {
3809       const fs_reg image = get_nir_image_intrinsic_image(bld, instr);
3810       const fs_reg addr = retype(get_nir_src(instr->src[1]),
3811                                  BRW_REGISTER_TYPE_UD);
3812
3813       fs_reg tmp = emit_untyped_read(bld, image, addr, 1,
3814                                      instr->num_components);
3815
3816       for (unsigned c = 0; c < instr->num_components; ++c) {
3817          bld.MOV(offset(retype(dest, tmp.type), bld, c),
3818                  offset(tmp, bld, c));
3819       }
3820       break;
3821    }
3822
3823    case nir_intrinsic_image_store_raw_intel: {
3824       const fs_reg image = get_nir_image_intrinsic_image(bld, instr);
3825       const fs_reg addr = retype(get_nir_src(instr->src[1]),
3826                                  BRW_REGISTER_TYPE_UD);
3827       const fs_reg data = retype(get_nir_src(instr->src[2]),
3828                                  BRW_REGISTER_TYPE_UD);
3829
3830       brw_wm_prog_data(prog_data)->has_side_effects = true;
3831
3832       emit_untyped_write(bld, image, addr, data, 1,
3833                          instr->num_components);
3834       break;
3835    }
3836
3837    case nir_intrinsic_group_memory_barrier:
3838    case nir_intrinsic_memory_barrier_shared:
3839    case nir_intrinsic_memory_barrier_atomic_counter:
3840    case nir_intrinsic_memory_barrier_buffer:
3841    case nir_intrinsic_memory_barrier_image:
3842    case nir_intrinsic_memory_barrier: {
3843       const fs_builder ubld = bld.group(8, 0);
3844       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
3845       ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
3846          ->size_written = 2 * REG_SIZE;
3847       break;
3848    }
3849
3850    case nir_intrinsic_shader_clock: {
3851       /* We cannot do anything if there is an event, so ignore it for now */
3852       const fs_reg shader_clock = get_timestamp(bld);
3853       const fs_reg srcs[] = { component(shader_clock, 0),
3854                               component(shader_clock, 1) };
3855       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
3856       break;
3857    }
3858
3859    case nir_intrinsic_image_samples:
3860       /* The driver does not support multi-sampled images. */
3861       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
3862       break;
3863
3864    case nir_intrinsic_load_uniform: {
3865       /* Offsets are in bytes but they should always aligned to
3866        * the type size
3867        */
3868       assert(instr->const_index[0] % 4 == 0 ||
3869              instr->const_index[0] % type_sz(dest.type) == 0);
3870
3871       fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
3872
3873       if (nir_src_is_const(instr->src[0])) {
3874          unsigned load_offset = nir_src_as_uint(instr->src[0]);
3875          assert(load_offset % type_sz(dest.type) == 0);
3876          /* For 16-bit types we add the module of the const_index[0]
3877           * offset to access to not 32-bit aligned element
3878           */
3879          src.offset = load_offset + instr->const_index[0] % 4;
3880
3881          for (unsigned j = 0; j < instr->num_components; j++) {
3882             bld.MOV(offset(dest, bld, j), offset(src, bld, j));
3883          }
3884       } else {
3885          fs_reg indirect = retype(get_nir_src(instr->src[0]),
3886                                   BRW_REGISTER_TYPE_UD);
3887
3888          /* We need to pass a size to the MOV_INDIRECT but we don't want it to
3889           * go past the end of the uniform.  In order to keep the n'th
3890           * component from running past, we subtract off the size of all but
3891           * one component of the vector.
3892           */
3893          assert(instr->const_index[1] >=
3894                 instr->num_components * (int) type_sz(dest.type));
3895          unsigned read_size = instr->const_index[1] -
3896             (instr->num_components - 1) * type_sz(dest.type);
3897
3898          bool supports_64bit_indirects =
3899             !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo);
3900
3901          if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
3902             for (unsigned j = 0; j < instr->num_components; j++) {
3903                bld.emit(SHADER_OPCODE_MOV_INDIRECT,
3904                         offset(dest, bld, j), offset(src, bld, j),
3905                         indirect, brw_imm_ud(read_size));
3906             }
3907          } else {
3908             const unsigned num_mov_indirects =
3909                type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
3910             /* We read a little bit less per MOV INDIRECT, as they are now
3911              * 32-bits ones instead of 64-bit. Fix read_size then.
3912              */
3913             const unsigned read_size_32bit = read_size -
3914                 (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
3915             for (unsigned j = 0; j < instr->num_components; j++) {
3916                for (unsigned i = 0; i < num_mov_indirects; i++) {
3917                   bld.emit(SHADER_OPCODE_MOV_INDIRECT,
3918                            subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
3919                            subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
3920                            indirect, brw_imm_ud(read_size_32bit));
3921                }
3922             }
3923          }
3924       }
3925       break;
3926    }
3927
3928    case nir_intrinsic_load_ubo: {
3929       fs_reg surf_index;
3930       if (nir_src_is_const(instr->src[0])) {
3931          const unsigned index = stage_prog_data->binding_table.ubo_start +
3932                                 nir_src_as_uint(instr->src[0]);
3933          surf_index = brw_imm_ud(index);
3934          brw_mark_surface_used(prog_data, index);
3935       } else {
3936          /* The block index is not a constant. Evaluate the index expression
3937           * per-channel and add the base UBO index; we have to select a value
3938           * from any live channel.
3939           */
3940          surf_index = vgrf(glsl_type::uint_type);
3941          bld.ADD(surf_index, get_nir_src(instr->src[0]),
3942                  brw_imm_ud(stage_prog_data->binding_table.ubo_start));
3943          surf_index = bld.emit_uniformize(surf_index);
3944
3945          /* Assume this may touch any UBO. It would be nice to provide
3946           * a tighter bound, but the array information is already lowered away.
3947           */
3948          brw_mark_surface_used(prog_data,
3949                                stage_prog_data->binding_table.ubo_start +
3950                                nir->info.num_ubos - 1);
3951       }
3952
3953       if (!nir_src_is_const(instr->src[1])) {
3954          fs_reg base_offset = retype(get_nir_src(instr->src[1]),
3955                                      BRW_REGISTER_TYPE_UD);
3956
3957          for (int i = 0; i < instr->num_components; i++)
3958             VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
3959                                        base_offset, i * type_sz(dest.type));
3960       } else {
3961          /* Even if we are loading doubles, a pull constant load will load
3962           * a 32-bit vec4, so should only reserve vgrf space for that. If we
3963           * need to load a full dvec4 we will have to emit 2 loads. This is
3964           * similar to demote_pull_constants(), except that in that case we
3965           * see individual accesses to each component of the vector and then
3966           * we let CSE deal with duplicate loads. Here we see a vector access
3967           * and we have to split it if necessary.
3968           */
3969          const unsigned type_size = type_sz(dest.type);
3970          const unsigned load_offset = nir_src_as_uint(instr->src[1]);
3971
3972          /* See if we've selected this as a push constant candidate */
3973          if (nir_src_is_const(instr->src[0])) {
3974             const unsigned ubo_block = nir_src_as_uint(instr->src[0]);
3975             const unsigned offset_256b = load_offset / 32;
3976
3977             fs_reg push_reg;
3978             for (int i = 0; i < 4; i++) {
3979                const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
3980                if (range->block == ubo_block &&
3981                    offset_256b >= range->start &&
3982                    offset_256b < range->start + range->length) {
3983
3984                   push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
3985                   push_reg.offset = load_offset - 32 * range->start;
3986                   break;
3987                }
3988             }
3989
3990             if (push_reg.file != BAD_FILE) {
3991                for (unsigned i = 0; i < instr->num_components; i++) {
3992                   bld.MOV(offset(dest, bld, i),
3993                           byte_offset(push_reg, i * type_size));
3994                }
3995                break;
3996             }
3997          }
3998
3999          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
4000          const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
4001          const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4002
4003          for (unsigned c = 0; c < instr->num_components;) {
4004             const unsigned base = load_offset + c * type_size;
4005             /* Number of usable components in the next block-aligned load. */
4006             const unsigned count = MIN2(instr->num_components - c,
4007                                         (block_sz - base % block_sz) / type_size);
4008
4009             ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
4010                       packed_consts, surf_index,
4011                       brw_imm_ud(base & ~(block_sz - 1)));
4012
4013             const fs_reg consts =
4014                retype(byte_offset(packed_consts, base & (block_sz - 1)),
4015                       dest.type);
4016
4017             for (unsigned d = 0; d < count; d++)
4018                bld.MOV(offset(dest, bld, c + d), component(consts, d));
4019
4020             c += count;
4021          }
4022       }
4023       break;
4024    }
4025
4026    case nir_intrinsic_load_ssbo: {
4027       assert(devinfo->gen >= 7);
4028
4029       const unsigned bit_size = nir_dest_bit_size(instr->dest);
4030       fs_reg surf_index = get_nir_ssbo_intrinsic_index(bld, instr);
4031       fs_reg offset_reg = retype(get_nir_src(instr->src[1]),
4032                                  BRW_REGISTER_TYPE_UD);
4033
4034       /* Make dest unsigned because that's what the temporary will be */
4035       dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4036
4037       /* Read the vector */
4038       if (nir_intrinsic_align(instr) >= 4) {
4039          assert(nir_dest_bit_size(instr->dest) == 32);
4040          fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
4041                                                 1 /* dims */,
4042                                                 instr->num_components,
4043                                                 BRW_PREDICATE_NONE);
4044          for (unsigned i = 0; i < instr->num_components; i++)
4045             bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
4046       } else {
4047          assert(nir_dest_bit_size(instr->dest) <= 32);
4048          assert(nir_dest_num_components(instr->dest) == 1);
4049          fs_reg read_result =
4050             emit_byte_scattered_read(bld, surf_index, offset_reg,
4051                                      1 /* dims */, 1, bit_size,
4052                                      BRW_PREDICATE_NONE);
4053          bld.MOV(dest, read_result);
4054       }
4055       break;
4056    }
4057
4058    case nir_intrinsic_store_ssbo: {
4059       assert(devinfo->gen >= 7);
4060
4061       if (stage == MESA_SHADER_FRAGMENT)
4062          brw_wm_prog_data(prog_data)->has_side_effects = true;
4063
4064       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4065       fs_reg val_reg = get_nir_src(instr->src[0]);
4066       fs_reg surf_index = get_nir_ssbo_intrinsic_index(bld, instr);
4067       fs_reg offset_reg = retype(get_nir_src(instr->src[2]),
4068                                  BRW_REGISTER_TYPE_UD);
4069
4070       val_reg.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4071
4072       assert(nir_intrinsic_write_mask(instr) ==
4073              (1 << instr->num_components) - 1);
4074       if (nir_intrinsic_align(instr) >= 4) {
4075          assert(nir_src_bit_size(instr->src[0]) == 32);
4076          assert(nir_src_num_components(instr->src[0]) <= 4);
4077          emit_untyped_write(bld, surf_index, offset_reg, val_reg,
4078                             1 /* dims */, instr->num_components,
4079                             BRW_PREDICATE_NONE);
4080       } else {
4081          assert(nir_src_bit_size(instr->src[0]) <= 32);
4082          assert(nir_src_num_components(instr->src[0]) == 1);
4083          fs_reg write_src = bld.vgrf(BRW_REGISTER_TYPE_UD);
4084          bld.MOV(write_src, val_reg);
4085          emit_byte_scattered_write(bld, surf_index, offset_reg,
4086                                    write_src, 1 /* dims */, bit_size,
4087                                    BRW_PREDICATE_NONE);
4088       }
4089       break;
4090    }
4091
4092    case nir_intrinsic_store_output: {
4093       fs_reg src = get_nir_src(instr->src[0]);
4094
4095       unsigned store_offset = nir_src_as_uint(instr->src[1]);
4096       unsigned num_components = instr->num_components;
4097       unsigned first_component = nir_intrinsic_component(instr);
4098       if (nir_src_bit_size(instr->src[0]) == 64) {
4099          src = shuffle_for_32bit_write(bld, src, 0, num_components);
4100          num_components *= 2;
4101       }
4102
4103       fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
4104                                       4 * store_offset), src.type);
4105       for (unsigned j = 0; j < num_components; j++) {
4106          bld.MOV(offset(new_dest, bld, j + first_component),
4107                  offset(src, bld, j));
4108       }
4109       break;
4110    }
4111
4112    case nir_intrinsic_ssbo_atomic_add:
4113       nir_emit_ssbo_atomic(bld, get_op_for_atomic_add(instr, 2), instr);
4114       break;
4115    case nir_intrinsic_ssbo_atomic_imin:
4116       nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
4117       break;
4118    case nir_intrinsic_ssbo_atomic_umin:
4119       nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
4120       break;
4121    case nir_intrinsic_ssbo_atomic_imax:
4122       nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
4123       break;
4124    case nir_intrinsic_ssbo_atomic_umax:
4125       nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
4126       break;
4127    case nir_intrinsic_ssbo_atomic_and:
4128       nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
4129       break;
4130    case nir_intrinsic_ssbo_atomic_or:
4131       nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
4132       break;
4133    case nir_intrinsic_ssbo_atomic_xor:
4134       nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
4135       break;
4136    case nir_intrinsic_ssbo_atomic_exchange:
4137       nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
4138       break;
4139    case nir_intrinsic_ssbo_atomic_comp_swap:
4140       nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
4141       break;
4142    case nir_intrinsic_ssbo_atomic_fmin:
4143       nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMIN, instr);
4144       break;
4145    case nir_intrinsic_ssbo_atomic_fmax:
4146       nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMAX, instr);
4147       break;
4148    case nir_intrinsic_ssbo_atomic_fcomp_swap:
4149       nir_emit_ssbo_atomic_float(bld, BRW_AOP_FCMPWR, instr);
4150       break;
4151
4152    case nir_intrinsic_get_buffer_size: {
4153       unsigned ssbo_index = nir_src_is_const(instr->src[0]) ?
4154                             nir_src_as_uint(instr->src[0]) : 0;
4155
4156       /* A resinfo's sampler message is used to get the buffer size.  The
4157        * SIMD8's writeback message consists of four registers and SIMD16's
4158        * writeback message consists of 8 destination registers (two per each
4159        * component).  Because we are only interested on the first channel of
4160        * the first returned component, where resinfo returns the buffer size
4161        * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
4162        * the dispatch width.
4163        */
4164       const fs_builder ubld = bld.exec_all().group(8, 0);
4165       fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4166       fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4167
4168       /* Set LOD = 0 */
4169       ubld.MOV(src_payload, brw_imm_d(0));
4170
4171       const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
4172       fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
4173                                 src_payload, brw_imm_ud(index));
4174       inst->header_size = 0;
4175       inst->mlen = 1;
4176       inst->size_written = 4 * REG_SIZE;
4177
4178       /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
4179        *
4180        * "Out-of-bounds checking is always performed at a DWord granularity. If
4181        * any part of the DWord is out-of-bounds then the whole DWord is
4182        * considered out-of-bounds."
4183        *
4184        * This implies that types with size smaller than 4-bytes need to be
4185        * padded if they don't complete the last dword of the buffer. But as we
4186        * need to maintain the original size we need to reverse the padding
4187        * calculation to return the correct size to know the number of elements
4188        * of an unsized array. As we stored in the last two bits of the surface
4189        * size the needed padding for the buffer, we calculate here the
4190        * original buffer_size reversing the surface_size calculation:
4191        *
4192        * surface_size = isl_align(buffer_size, 4) +
4193        *                (isl_align(buffer_size) - buffer_size)
4194        *
4195        * buffer_size = surface_size & ~3 - surface_size & 3
4196        */
4197
4198       fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4199       fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4200       fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4201
4202       ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
4203       ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
4204       ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
4205
4206       bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
4207
4208       brw_mark_surface_used(prog_data, index);
4209       break;
4210    }
4211
4212    case nir_intrinsic_load_subgroup_invocation:
4213       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
4214               nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
4215       break;
4216
4217    case nir_intrinsic_load_subgroup_eq_mask:
4218    case nir_intrinsic_load_subgroup_ge_mask:
4219    case nir_intrinsic_load_subgroup_gt_mask:
4220    case nir_intrinsic_load_subgroup_le_mask:
4221    case nir_intrinsic_load_subgroup_lt_mask:
4222       unreachable("not reached");
4223
4224    case nir_intrinsic_vote_any: {
4225       const fs_builder ubld = bld.exec_all().group(1, 0);
4226
4227       /* The any/all predicates do not consider channel enables. To prevent
4228        * dead channels from affecting the result, we initialize the flag with
4229        * with the identity value for the logical operation.
4230        */
4231       if (dispatch_width == 32) {
4232          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4233          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4234                          brw_imm_ud(0));
4235       } else {
4236          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
4237       }
4238       bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
4239
4240       /* For some reason, the any/all predicates don't work properly with
4241        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4242        * doesn't read the correct subset of the flag register and you end up
4243        * getting garbage in the second half.  Work around this by using a pair
4244        * of 1-wide MOVs and scattering the result.
4245        */
4246       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4247       ubld.MOV(res1, brw_imm_d(0));
4248       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
4249                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
4250                                            BRW_PREDICATE_ALIGN1_ANY32H,
4251                     ubld.MOV(res1, brw_imm_d(-1)));
4252
4253       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4254       break;
4255    }
4256    case nir_intrinsic_vote_all: {
4257       const fs_builder ubld = bld.exec_all().group(1, 0);
4258
4259       /* The any/all predicates do not consider channel enables. To prevent
4260        * dead channels from affecting the result, we initialize the flag with
4261        * with the identity value for the logical operation.
4262        */
4263       if (dispatch_width == 32) {
4264          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4265          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4266                          brw_imm_ud(0xffffffff));
4267       } else {
4268          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
4269       }
4270       bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
4271
4272       /* For some reason, the any/all predicates don't work properly with
4273        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4274        * doesn't read the correct subset of the flag register and you end up
4275        * getting garbage in the second half.  Work around this by using a pair
4276        * of 1-wide MOVs and scattering the result.
4277        */
4278       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4279       ubld.MOV(res1, brw_imm_d(0));
4280       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
4281                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
4282                                            BRW_PREDICATE_ALIGN1_ALL32H,
4283                     ubld.MOV(res1, brw_imm_d(-1)));
4284
4285       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4286       break;
4287    }
4288    case nir_intrinsic_vote_feq:
4289    case nir_intrinsic_vote_ieq: {
4290       fs_reg value = get_nir_src(instr->src[0]);
4291       if (instr->intrinsic == nir_intrinsic_vote_feq) {
4292          const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4293          value.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
4294       }
4295
4296       fs_reg uniformized = bld.emit_uniformize(value);
4297       const fs_builder ubld = bld.exec_all().group(1, 0);
4298
4299       /* The any/all predicates do not consider channel enables. To prevent
4300        * dead channels from affecting the result, we initialize the flag with
4301        * with the identity value for the logical operation.
4302        */
4303       if (dispatch_width == 32) {
4304          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4305          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4306                          brw_imm_ud(0xffffffff));
4307       } else {
4308          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
4309       }
4310       bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
4311
4312       /* For some reason, the any/all predicates don't work properly with
4313        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4314        * doesn't read the correct subset of the flag register and you end up
4315        * getting garbage in the second half.  Work around this by using a pair
4316        * of 1-wide MOVs and scattering the result.
4317        */
4318       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4319       ubld.MOV(res1, brw_imm_d(0));
4320       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
4321                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
4322                                            BRW_PREDICATE_ALIGN1_ALL32H,
4323                     ubld.MOV(res1, brw_imm_d(-1)));
4324
4325       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4326       break;
4327    }
4328
4329    case nir_intrinsic_ballot: {
4330       const fs_reg value = retype(get_nir_src(instr->src[0]),
4331                                   BRW_REGISTER_TYPE_UD);
4332       struct brw_reg flag = brw_flag_reg(0, 0);
4333       /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
4334        * as f0.0.  This is a problem for fragment programs as we currently use
4335        * f0.1 for discards.  Fortunately, we don't support SIMD32 fragment
4336        * programs yet so this isn't a problem.  When we do, something will
4337        * have to change.
4338        */
4339       if (dispatch_width == 32)
4340          flag.type = BRW_REGISTER_TYPE_UD;
4341
4342       bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u));
4343       bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
4344
4345       if (instr->dest.ssa.bit_size > 32) {
4346          dest.type = BRW_REGISTER_TYPE_UQ;
4347       } else {
4348          dest.type = BRW_REGISTER_TYPE_UD;
4349       }
4350       bld.MOV(dest, flag);
4351       break;
4352    }
4353
4354    case nir_intrinsic_read_invocation: {
4355       const fs_reg value = get_nir_src(instr->src[0]);
4356       const fs_reg invocation = get_nir_src(instr->src[1]);
4357       fs_reg tmp = bld.vgrf(value.type);
4358
4359       bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
4360                           bld.emit_uniformize(invocation));
4361
4362       bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0)));
4363       break;
4364    }
4365
4366    case nir_intrinsic_read_first_invocation: {
4367       const fs_reg value = get_nir_src(instr->src[0]);
4368       bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
4369       break;
4370    }
4371
4372    case nir_intrinsic_shuffle: {
4373       const fs_reg value = get_nir_src(instr->src[0]);
4374       const fs_reg index = get_nir_src(instr->src[1]);
4375
4376       bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
4377       break;
4378    }
4379
4380    case nir_intrinsic_first_invocation: {
4381       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4382       bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
4383       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
4384               fs_reg(component(tmp, 0)));
4385       break;
4386    }
4387
4388    case nir_intrinsic_quad_broadcast: {
4389       const fs_reg value = get_nir_src(instr->src[0]);
4390       const unsigned index = nir_src_as_uint(instr->src[1]);
4391
4392       bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
4393                value, brw_imm_ud(index), brw_imm_ud(4));
4394       break;
4395    }
4396
4397    case nir_intrinsic_quad_swap_horizontal: {
4398       const fs_reg value = get_nir_src(instr->src[0]);
4399       const fs_reg tmp = bld.vgrf(value.type);
4400       const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
4401
4402       const fs_reg src_left = horiz_stride(value, 2);
4403       const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
4404       const fs_reg tmp_left = horiz_stride(tmp, 2);
4405       const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
4406
4407       /* From the Cherryview PRM Vol. 7, "Register Region Restrictiosn":
4408        *
4409        *    "When source or destination datatype is 64b or operation is
4410        *    integer DWord multiply, regioning in Align1 must follow
4411        *    these rules:
4412        *
4413        *    [...]
4414        *
4415        *    3. Source and Destination offset must be the same, except
4416        *       the case of scalar source."
4417        *
4418        * In order to work around this, we have to emit two 32-bit MOVs instead
4419        * of a single 64-bit MOV to do the shuffle.
4420        */
4421       if (type_sz(value.type) > 4 &&
4422           (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
4423          ubld.MOV(subscript(tmp_left, BRW_REGISTER_TYPE_D, 0),
4424                   subscript(src_right, BRW_REGISTER_TYPE_D, 0));
4425          ubld.MOV(subscript(tmp_left, BRW_REGISTER_TYPE_D, 1),
4426                   subscript(src_right, BRW_REGISTER_TYPE_D, 1));
4427          ubld.MOV(subscript(tmp_right, BRW_REGISTER_TYPE_D, 0),
4428                   subscript(src_left, BRW_REGISTER_TYPE_D, 0));
4429          ubld.MOV(subscript(tmp_right, BRW_REGISTER_TYPE_D, 1),
4430                   subscript(src_left, BRW_REGISTER_TYPE_D, 1));
4431       } else {
4432          ubld.MOV(tmp_left, src_right);
4433          ubld.MOV(tmp_right, src_left);
4434       }
4435       bld.MOV(retype(dest, value.type), tmp);
4436       break;
4437    }
4438
4439    case nir_intrinsic_quad_swap_vertical: {
4440       const fs_reg value = get_nir_src(instr->src[0]);
4441       if (nir_src_bit_size(instr->src[0]) == 32) {
4442          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
4443          const fs_reg tmp = bld.vgrf(value.type);
4444          const fs_builder ubld = bld.exec_all();
4445          ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
4446                    brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
4447          bld.MOV(retype(dest, value.type), tmp);
4448       } else {
4449          /* For larger data types, we have to either emit dispatch_width many
4450           * MOVs or else fall back to doing indirects.
4451           */
4452          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4453          bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4454                       brw_imm_w(0x2));
4455          bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
4456       }
4457       break;
4458    }
4459
4460    case nir_intrinsic_quad_swap_diagonal: {
4461       const fs_reg value = get_nir_src(instr->src[0]);
4462       if (nir_src_bit_size(instr->src[0]) == 32) {
4463          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
4464          const fs_reg tmp = bld.vgrf(value.type);
4465          const fs_builder ubld = bld.exec_all();
4466          ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
4467                    brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
4468          bld.MOV(retype(dest, value.type), tmp);
4469       } else {
4470          /* For larger data types, we have to either emit dispatch_width many
4471           * MOVs or else fall back to doing indirects.
4472           */
4473          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4474          bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4475                       brw_imm_w(0x3));
4476          bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
4477       }
4478       break;
4479    }
4480
4481    case nir_intrinsic_reduce: {
4482       fs_reg src = get_nir_src(instr->src[0]);
4483       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
4484       unsigned cluster_size = nir_intrinsic_cluster_size(instr);
4485       if (cluster_size == 0 || cluster_size > dispatch_width)
4486          cluster_size = dispatch_width;
4487
4488       /* Figure out the source type */
4489       src.type = brw_type_for_nir_type(devinfo,
4490          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
4491                         nir_src_bit_size(instr->src[0])));
4492
4493       fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
4494       opcode brw_op = brw_op_for_nir_reduction_op(redop);
4495       brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
4496
4497       /* Set up a register for all of our scratching around and initialize it
4498        * to reduction operation's identity value.
4499        */
4500       fs_reg scan = bld.vgrf(src.type);
4501       bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
4502
4503       bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
4504
4505       dest.type = src.type;
4506       if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
4507          /* In this case, CLUSTER_BROADCAST instruction isn't needed because
4508           * the distance between clusters is at least 2 GRFs.  In this case,
4509           * we don't need the weird striding of the CLUSTER_BROADCAST
4510           * instruction and can just do regular MOVs.
4511           */
4512          assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
4513          const unsigned groups =
4514             (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
4515          const unsigned group_size = dispatch_width / groups;
4516          for (unsigned i = 0; i < groups; i++) {
4517             const unsigned cluster = (i * group_size) / cluster_size;
4518             const unsigned comp = cluster * cluster_size + (cluster_size - 1);
4519             bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
4520                                          component(scan, comp));
4521          }
4522       } else {
4523          bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
4524                   brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
4525       }
4526       break;
4527    }
4528
4529    case nir_intrinsic_inclusive_scan:
4530    case nir_intrinsic_exclusive_scan: {
4531       fs_reg src = get_nir_src(instr->src[0]);
4532       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
4533
4534       /* Figure out the source type */
4535       src.type = brw_type_for_nir_type(devinfo,
4536          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
4537                         nir_src_bit_size(instr->src[0])));
4538
4539       fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
4540       opcode brw_op = brw_op_for_nir_reduction_op(redop);
4541       brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
4542
4543       /* Set up a register for all of our scratching around and initialize it
4544        * to reduction operation's identity value.
4545        */
4546       fs_reg scan = bld.vgrf(src.type);
4547       const fs_builder allbld = bld.exec_all();
4548       allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
4549
4550       if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
4551          /* Exclusive scan is a bit harder because we have to do an annoying
4552           * shift of the contents before we can begin.  To make things worse,
4553           * we can't do this with a normal stride; we have to use indirects.
4554           */
4555          fs_reg shifted = bld.vgrf(src.type);
4556          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4557          allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4558                          brw_imm_w(-1));
4559          allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
4560          allbld.group(1, 0).MOV(component(shifted, 0), identity);
4561          scan = shifted;
4562       }
4563
4564       bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);
4565
4566       bld.MOV(retype(dest, src.type), scan);
4567       break;
4568    }
4569
4570    case nir_intrinsic_begin_fragment_shader_ordering:
4571    case nir_intrinsic_begin_invocation_interlock: {
4572       const fs_builder ubld = bld.group(8, 0);
4573       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4574
4575       ubld.emit(SHADER_OPCODE_INTERLOCK, tmp)->size_written = 2 *
4576          REG_SIZE;
4577
4578       break;
4579    }
4580
4581    case nir_intrinsic_end_invocation_interlock: {
4582       /* We don't need to do anything here */
4583       break;
4584    }
4585
4586    default:
4587       unreachable("unknown intrinsic");
4588    }
4589 }
4590
4591 void
4592 fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
4593                                  int op, nir_intrinsic_instr *instr)
4594 {
4595    if (stage == MESA_SHADER_FRAGMENT)
4596       brw_wm_prog_data(prog_data)->has_side_effects = true;
4597
4598    fs_reg dest;
4599    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4600       dest = get_nir_dest(instr->dest);
4601
4602    fs_reg surface = get_nir_ssbo_intrinsic_index(bld, instr);
4603    fs_reg offset = get_nir_src(instr->src[1]);
4604    fs_reg data1;
4605    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
4606       data1 = get_nir_src(instr->src[2]);
4607    fs_reg data2;
4608    if (op == BRW_AOP_CMPWR)
4609       data2 = get_nir_src(instr->src[3]);
4610
4611    /* Emit the actual atomic operation */
4612
4613    fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
4614                                               data1, data2,
4615                                               1 /* dims */, 1 /* rsize */,
4616                                               op,
4617                                               BRW_PREDICATE_NONE);
4618    dest.type = atomic_result.type;
4619    bld.MOV(dest, atomic_result);
4620 }
4621
4622 void
4623 fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld,
4624                                        int op, nir_intrinsic_instr *instr)
4625 {
4626    if (stage == MESA_SHADER_FRAGMENT)
4627       brw_wm_prog_data(prog_data)->has_side_effects = true;
4628
4629    fs_reg dest;
4630    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4631       dest = get_nir_dest(instr->dest);
4632
4633    fs_reg surface = get_nir_ssbo_intrinsic_index(bld, instr);
4634    fs_reg offset = get_nir_src(instr->src[1]);
4635    fs_reg data1 = get_nir_src(instr->src[2]);
4636    fs_reg data2;
4637    if (op == BRW_AOP_FCMPWR)
4638       data2 = get_nir_src(instr->src[3]);
4639
4640    /* Emit the actual atomic operation */
4641
4642    fs_reg atomic_result = emit_untyped_atomic_float(bld, surface, offset,
4643                                                     data1, data2,
4644                                                     1 /* dims */, 1 /* rsize */,
4645                                                     op,
4646                                                     BRW_PREDICATE_NONE);
4647    dest.type = atomic_result.type;
4648    bld.MOV(dest, atomic_result);
4649 }
4650
4651 void
4652 fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
4653                                    int op, nir_intrinsic_instr *instr)
4654 {
4655    fs_reg dest;
4656    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4657       dest = get_nir_dest(instr->dest);
4658
4659    fs_reg surface = brw_imm_ud(GEN7_BTI_SLM);
4660    fs_reg offset;
4661    fs_reg data1;
4662    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
4663       data1 = get_nir_src(instr->src[1]);
4664    fs_reg data2;
4665    if (op == BRW_AOP_CMPWR)
4666       data2 = get_nir_src(instr->src[2]);
4667
4668    /* Get the offset */
4669    if (nir_src_is_const(instr->src[0])) {
4670       offset = brw_imm_ud(instr->const_index[0] +
4671                           nir_src_as_uint(instr->src[0]));
4672    } else {
4673       offset = vgrf(glsl_type::uint_type);
4674       bld.ADD(offset,
4675               retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
4676               brw_imm_ud(instr->const_index[0]));
4677    }
4678
4679    /* Emit the actual atomic operation operation */
4680
4681    fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
4682                                               data1, data2,
4683                                               1 /* dims */, 1 /* rsize */,
4684                                               op,
4685                                               BRW_PREDICATE_NONE);
4686    dest.type = atomic_result.type;
4687    bld.MOV(dest, atomic_result);
4688 }
4689
4690 void
4691 fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld,
4692                                          int op, nir_intrinsic_instr *instr)
4693 {
4694    fs_reg dest;
4695    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4696       dest = get_nir_dest(instr->dest);
4697
4698    fs_reg surface = brw_imm_ud(GEN7_BTI_SLM);
4699    fs_reg offset;
4700    fs_reg data1 = get_nir_src(instr->src[1]);
4701    fs_reg data2;
4702    if (op == BRW_AOP_FCMPWR)
4703       data2 = get_nir_src(instr->src[2]);
4704
4705    /* Get the offset */
4706    if (nir_src_is_const(instr->src[0])) {
4707       offset = brw_imm_ud(instr->const_index[0] +
4708                           nir_src_as_uint(instr->src[0]));
4709    } else {
4710       offset = vgrf(glsl_type::uint_type);
4711       bld.ADD(offset,
4712               retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
4713               brw_imm_ud(instr->const_index[0]));
4714    }
4715
4716    /* Emit the actual atomic operation operation */
4717
4718    fs_reg atomic_result = emit_untyped_atomic_float(bld, surface, offset,
4719                                                     data1, data2,
4720                                                     1 /* dims */, 1 /* rsize */,
4721                                                     op,
4722                                                     BRW_PREDICATE_NONE);
4723    dest.type = atomic_result.type;
4724    bld.MOV(dest, atomic_result);
4725 }
4726
4727 void
4728 fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
4729 {
4730    unsigned texture = instr->texture_index;
4731    unsigned sampler = instr->sampler_index;
4732
4733    fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
4734
4735    srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
4736    srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);
4737
4738    int lod_components = 0;
4739
4740    /* The hardware requires a LOD for buffer textures */
4741    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
4742       srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
4743
4744    uint32_t header_bits = 0;
4745    for (unsigned i = 0; i < instr->num_srcs; i++) {
4746       fs_reg src = get_nir_src(instr->src[i].src);
4747       switch (instr->src[i].src_type) {
4748       case nir_tex_src_bias:
4749          srcs[TEX_LOGICAL_SRC_LOD] =
4750             retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
4751          break;
4752       case nir_tex_src_comparator:
4753          srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
4754          break;
4755       case nir_tex_src_coord:
4756          switch (instr->op) {
4757          case nir_texop_txf:
4758          case nir_texop_txf_ms:
4759          case nir_texop_txf_ms_mcs:
4760          case nir_texop_samples_identical:
4761             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
4762             break;
4763          default:
4764             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
4765             break;
4766          }
4767          break;
4768       case nir_tex_src_ddx:
4769          srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
4770          lod_components = nir_tex_instr_src_size(instr, i);
4771          break;
4772       case nir_tex_src_ddy:
4773          srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
4774          break;
4775       case nir_tex_src_lod:
4776          switch (instr->op) {
4777          case nir_texop_txs:
4778             srcs[TEX_LOGICAL_SRC_LOD] =
4779                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
4780             break;
4781          case nir_texop_txf:
4782             srcs[TEX_LOGICAL_SRC_LOD] =
4783                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
4784             break;
4785          default:
4786             srcs[TEX_LOGICAL_SRC_LOD] =
4787                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
4788             break;
4789          }
4790          break;
4791       case nir_tex_src_ms_index:
4792          srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
4793          break;
4794
4795       case nir_tex_src_offset: {
4796          nir_const_value *const_offset =
4797             nir_src_as_const_value(instr->src[i].src);
4798          assert(nir_src_bit_size(instr->src[i].src) == 32);
4799          unsigned offset_bits = 0;
4800          if (const_offset &&
4801              brw_texture_offset(const_offset->i32,
4802                                 nir_tex_instr_src_size(instr, i),
4803                                 &offset_bits)) {
4804             header_bits |= offset_bits;
4805          } else {
4806             srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
4807                retype(src, BRW_REGISTER_TYPE_D);
4808          }
4809          break;
4810       }
4811
4812       case nir_tex_src_projector:
4813          unreachable("should be lowered");
4814
4815       case nir_tex_src_texture_offset: {
4816          /* Figure out the highest possible texture index and mark it as used */
4817          uint32_t max_used = texture + instr->texture_array_size - 1;
4818          if (instr->op == nir_texop_tg4 && devinfo->gen < 8) {
4819             max_used += stage_prog_data->binding_table.gather_texture_start;
4820          } else {
4821             max_used += stage_prog_data->binding_table.texture_start;
4822          }
4823          brw_mark_surface_used(prog_data, max_used);
4824
4825          /* Emit code to evaluate the actual indexing expression */
4826          fs_reg tmp = vgrf(glsl_type::uint_type);
4827          bld.ADD(tmp, src, brw_imm_ud(texture));
4828          srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
4829          break;
4830       }
4831
4832       case nir_tex_src_sampler_offset: {
4833          /* Emit code to evaluate the actual indexing expression */
4834          fs_reg tmp = vgrf(glsl_type::uint_type);
4835          bld.ADD(tmp, src, brw_imm_ud(sampler));
4836          srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
4837          break;
4838       }
4839
4840       case nir_tex_src_ms_mcs:
4841          assert(instr->op == nir_texop_txf_ms);
4842          srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
4843          break;
4844
4845       case nir_tex_src_plane: {
4846          const uint32_t plane = nir_src_as_uint(instr->src[i].src);
4847          const uint32_t texture_index =
4848             instr->texture_index +
4849             stage_prog_data->binding_table.plane_start[plane] -
4850             stage_prog_data->binding_table.texture_start;
4851
4852          srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
4853          break;
4854       }
4855
4856       default:
4857          unreachable("unknown texture source");
4858       }
4859    }
4860
4861    if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
4862        (instr->op == nir_texop_txf_ms ||
4863         instr->op == nir_texop_samples_identical)) {
4864       if (devinfo->gen >= 7 &&
4865           key_tex->compressed_multisample_layout_mask & (1 << texture)) {
4866          srcs[TEX_LOGICAL_SRC_MCS] =
4867             emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
4868                            instr->coord_components,
4869                            srcs[TEX_LOGICAL_SRC_SURFACE]);
4870       } else {
4871          srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
4872       }
4873    }
4874
4875    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
4876    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
4877
4878    enum opcode opcode;
4879    switch (instr->op) {
4880    case nir_texop_tex:
4881       opcode = (stage == MESA_SHADER_FRAGMENT ? SHADER_OPCODE_TEX_LOGICAL :
4882                 SHADER_OPCODE_TXL_LOGICAL);
4883       break;
4884    case nir_texop_txb:
4885       opcode = FS_OPCODE_TXB_LOGICAL;
4886       break;
4887    case nir_texop_txl:
4888       opcode = SHADER_OPCODE_TXL_LOGICAL;
4889       break;
4890    case nir_texop_txd:
4891       opcode = SHADER_OPCODE_TXD_LOGICAL;
4892       break;
4893    case nir_texop_txf:
4894       opcode = SHADER_OPCODE_TXF_LOGICAL;
4895       break;
4896    case nir_texop_txf_ms:
4897       if ((key_tex->msaa_16 & (1 << sampler)))
4898          opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
4899       else
4900          opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
4901       break;
4902    case nir_texop_txf_ms_mcs:
4903       opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
4904       break;
4905    case nir_texop_query_levels:
4906    case nir_texop_txs:
4907       opcode = SHADER_OPCODE_TXS_LOGICAL;
4908       break;
4909    case nir_texop_lod:
4910       opcode = SHADER_OPCODE_LOD_LOGICAL;
4911       break;
4912    case nir_texop_tg4:
4913       if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
4914          opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
4915       else
4916          opcode = SHADER_OPCODE_TG4_LOGICAL;
4917       break;
4918    case nir_texop_texture_samples:
4919       opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
4920       break;
4921    case nir_texop_samples_identical: {
4922       fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
4923
4924       /* If mcs is an immediate value, it means there is no MCS.  In that case
4925        * just return false.
4926        */
4927       if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
4928          bld.MOV(dst, brw_imm_ud(0u));
4929       } else if ((key_tex->msaa_16 & (1 << sampler))) {
4930          fs_reg tmp = vgrf(glsl_type::uint_type);
4931          bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
4932                 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
4933          bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
4934       } else {
4935          bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
4936                  BRW_CONDITIONAL_EQ);
4937       }
4938       return;
4939    }
4940    default:
4941       unreachable("unknown texture opcode");
4942    }
4943
4944    if (instr->op == nir_texop_tg4) {
4945       if (instr->component == 1 &&
4946           key_tex->gather_channel_quirk_mask & (1 << texture)) {
4947          /* gather4 sampler is broken for green channel on RG32F --
4948           * we must ask for blue instead.
4949           */
4950          header_bits |= 2 << 16;
4951       } else {
4952          header_bits |= instr->component << 16;
4953       }
4954    }
4955
4956    fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);
4957    fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
4958    inst->offset = header_bits;
4959
4960    const unsigned dest_size = nir_tex_instr_dest_size(instr);
4961    if (devinfo->gen >= 9 &&
4962        instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
4963       unsigned write_mask = instr->dest.is_ssa ?
4964                             nir_ssa_def_components_read(&instr->dest.ssa):
4965                             (1 << dest_size) - 1;
4966       assert(write_mask != 0); /* dead code should have been eliminated */
4967       inst->size_written = util_last_bit(write_mask) *
4968                            inst->dst.component_size(inst->exec_size);
4969    } else {
4970       inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
4971    }
4972
4973    if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
4974       inst->shadow_compare = true;
4975
4976    if (instr->op == nir_texop_tg4 && devinfo->gen == 6)
4977       emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);
4978
4979    fs_reg nir_dest[4];
4980    for (unsigned i = 0; i < dest_size; i++)
4981       nir_dest[i] = offset(dst, bld, i);
4982
4983    if (instr->op == nir_texop_query_levels) {
4984       /* # levels is in .w */
4985       nir_dest[0] = offset(dst, bld, 3);
4986    } else if (instr->op == nir_texop_txs &&
4987               dest_size >= 3 && devinfo->gen < 7) {
4988       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
4989       fs_reg depth = offset(dst, bld, 2);
4990       nir_dest[2] = vgrf(glsl_type::int_type);
4991       bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
4992    }
4993
4994    bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
4995 }
4996
4997 void
4998 fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
4999 {
5000    switch (instr->type) {
5001    case nir_jump_break:
5002       bld.emit(BRW_OPCODE_BREAK);
5003       break;
5004    case nir_jump_continue:
5005       bld.emit(BRW_OPCODE_CONTINUE);
5006       break;
5007    case nir_jump_return:
5008    default:
5009       unreachable("unknown jump");
5010    }
5011 }
5012
5013 /*
5014  * This helper takes a source register and un/shuffles it into the destination
5015  * register.
5016  *
5017  * If source type size is smaller than destination type size the operation
5018  * needed is a component shuffle. The opposite case would be an unshuffle. If
5019  * source/destination type size is equal a shuffle is done that would be
5020  * equivalent to a simple MOV.
5021  *
5022  * For example, if source is a 16-bit type and destination is 32-bit. A 3
5023  * components .xyz 16-bit vector on SIMD8 would be.
5024  *
5025  *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
5026  *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
5027  *
5028  * This helper will return the following 2 32-bit components with the 16-bit
5029  * values shuffled:
5030  *
5031  *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
5032  *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
5033  *
5034  * For unshuffle, the example would be the opposite, a 64-bit type source
5035  * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8
5036  * would be:
5037  *
5038  *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
5039  *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
5040  *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
5041  *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
5042  *
5043  * The returned result would be the following 4 32-bit components unshuffled:
5044  *
5045  *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
5046  *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
5047  *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
5048  *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
5049  *
5050  * - Source and destination register must not be overlapped.
5051  * - components units are measured in terms of the smaller type between
5052  *   source and destination because we are un/shuffling the smaller
5053  *   components from/into the bigger ones.
5054  * - first_component parameter allows skipping source components.
5055  */
5056 void
5057 shuffle_src_to_dst(const fs_builder &bld,
5058                    const fs_reg &dst,
5059                    const fs_reg &src,
5060                    uint32_t first_component,
5061                    uint32_t components)
5062 {
5063    if (type_sz(src.type) == type_sz(dst.type)) {
5064       assert(!regions_overlap(dst,
5065          type_sz(dst.type) * bld.dispatch_width() * components,
5066          offset(src, bld, first_component),
5067          type_sz(src.type) * bld.dispatch_width() * components));
5068       for (unsigned i = 0; i < components; i++) {
5069          bld.MOV(retype(offset(dst, bld, i), src.type),
5070                  offset(src, bld, i + first_component));
5071       }
5072    } else if (type_sz(src.type) < type_sz(dst.type)) {
5073       /* Source is shuffled into destination */
5074       unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
5075       assert(!regions_overlap(dst,
5076          type_sz(dst.type) * bld.dispatch_width() *
5077          DIV_ROUND_UP(components, size_ratio),
5078          offset(src, bld, first_component),
5079          type_sz(src.type) * bld.dispatch_width() * components));
5080
5081       brw_reg_type shuffle_type =
5082          brw_reg_type_from_bit_size(8 * type_sz(src.type),
5083                                     BRW_REGISTER_TYPE_D);
5084       for (unsigned i = 0; i < components; i++) {
5085          fs_reg shuffle_component_i =
5086             subscript(offset(dst, bld, i / size_ratio),
5087                       shuffle_type, i % size_ratio);
5088          bld.MOV(shuffle_component_i,
5089                  retype(offset(src, bld, i + first_component), shuffle_type));
5090       }
5091    } else {
5092       /* Source is unshuffled into destination */
5093       unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
5094       assert(!regions_overlap(dst,
5095          type_sz(dst.type) * bld.dispatch_width() * components,
5096          offset(src, bld, first_component / size_ratio),
5097          type_sz(src.type) * bld.dispatch_width() *
5098          DIV_ROUND_UP(components + (first_component % size_ratio),
5099                       size_ratio)));
5100
5101       brw_reg_type shuffle_type =
5102          brw_reg_type_from_bit_size(8 * type_sz(dst.type),
5103                                     BRW_REGISTER_TYPE_D);
5104       for (unsigned i = 0; i < components; i++) {
5105          fs_reg shuffle_component_i =
5106             subscript(offset(src, bld, (first_component + i) / size_ratio),
5107                       shuffle_type, (first_component + i) % size_ratio);
5108          bld.MOV(retype(offset(dst, bld, i), shuffle_type),
5109                  shuffle_component_i);
5110       }
5111    }
5112 }
5113
5114 void
5115 shuffle_from_32bit_read(const fs_builder &bld,
5116                         const fs_reg &dst,
5117                         const fs_reg &src,
5118                         uint32_t first_component,
5119                         uint32_t components)
5120 {
5121    assert(type_sz(src.type) == 4);
5122
5123    /* This function takes components in units of the destination type while
5124     * shuffle_src_to_dst takes components in units of the smallest type
5125     */
5126    if (type_sz(dst.type) > 4) {
5127       assert(type_sz(dst.type) == 8);
5128       first_component *= 2;
5129       components *= 2;
5130    }
5131
5132    shuffle_src_to_dst(bld, dst, src, first_component, components);
5133 }
5134
5135 fs_reg
5136 shuffle_for_32bit_write(const fs_builder &bld,
5137                         const fs_reg &src,
5138                         uint32_t first_component,
5139                         uint32_t components)
5140 {
5141    fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D,
5142                          DIV_ROUND_UP (components * type_sz(src.type), 4));
5143    /* This function takes components in units of the source type while
5144     * shuffle_src_to_dst takes components in units of the smallest type
5145     */
5146    if (type_sz(src.type) > 4) {
5147       assert(type_sz(src.type) == 8);
5148       first_component *= 2;
5149       components *= 2;
5150    }
5151
5152    shuffle_src_to_dst(bld, dst, src, first_component, components);
5153
5154    return dst;
5155 }
5156
5157 fs_reg
5158 setup_imm_df(const fs_builder &bld, double v)
5159 {
5160    const struct gen_device_info *devinfo = bld.shader->devinfo;
5161    assert(devinfo->gen >= 7);
5162
5163    if (devinfo->gen >= 8)
5164       return brw_imm_df(v);
5165
5166    /* gen7.5 does not support DF immediates straighforward but the DIM
5167     * instruction allows to set the 64-bit immediate value.
5168     */
5169    if (devinfo->is_haswell) {
5170       const fs_builder ubld = bld.exec_all().group(1, 0);
5171       fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
5172       ubld.DIM(dst, brw_imm_df(v));
5173       return component(dst, 0);
5174    }
5175
5176    /* gen7 does not support DF immediates, so we generate a 64-bit constant by
5177     * writing the low 32-bit of the constant to suboffset 0 of a VGRF and
5178     * the high 32-bit to suboffset 4 and then applying a stride of 0.
5179     *
5180     * Alternatively, we could also produce a normal VGRF (without stride 0)
5181     * by writing to all the channels in the VGRF, however, that would hit the
5182     * gen7 bug where we have to split writes that span more than 1 register
5183     * into instructions with a width of 4 (otherwise the write to the second
5184     * register written runs into an execmask hardware bug) which isn't very
5185     * nice.
5186     */
5187    union {
5188       double d;
5189       struct {
5190          uint32_t i1;
5191          uint32_t i2;
5192       };
5193    } di;
5194
5195    di.d = v;
5196
5197    const fs_builder ubld = bld.exec_all().group(1, 0);
5198    const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
5199    ubld.MOV(tmp, brw_imm_ud(di.i1));
5200    ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
5201
5202    return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
5203 }
5204
5205 fs_reg
5206 setup_imm_b(const fs_builder &bld, int8_t v)
5207 {
5208    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
5209    bld.MOV(tmp, brw_imm_w(v));
5210    return tmp;
5211 }
5212
5213 fs_reg
5214 setup_imm_ub(const fs_builder &bld, uint8_t v)
5215 {
5216    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
5217    bld.MOV(tmp, brw_imm_uw(v));
5218    return tmp;
5219 }