src/intel/compiler/brw_fs_nir.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "compiler/glsl/ir.h"
  25 #include "brw_fs.h"
  26 #include "brw_nir.h"
  27 #include "util/u_math.h"
  28 #include "util/bitscan.h"
  29
  30 using namespace brw;
  31
  32 void
  33 fs_visitor::emit_nir_code()
  34 {
  35    /* emit the arrays used for inputs and outputs - load/store intrinsics will
  36     * be converted to reads/writes of these arrays
  37     */
  38    nir_setup_outputs();
  39    nir_setup_uniforms();
  40    nir_emit_system_values();
  41
  42    nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
  43 }
  44
  45 void
  46 fs_visitor::nir_setup_outputs()
  47 {
  48    if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
  49       return;
  50
  51    unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
  52
  53    /* Calculate the size of output registers in a separate pass, before
  54     * allocating them.  With ARB_enhanced_layouts, multiple output variables
  55     * may occupy the same slot, but have different type sizes.
  56     */
  57    nir_foreach_variable(var, &nir->outputs) {
  58       const int loc = var->data.driver_location;
  59       const unsigned var_vec4s =
  60          var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
  61                            : type_size_vec4(var->type, true);
  62       vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
  63    }
  64
  65    for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
  66       if (vec4s[loc] == 0) {
  67          loc++;
  68          continue;
  69       }
  70
  71       unsigned reg_size = vec4s[loc];
  72
  73       /* Check if there are any ranges that start within this range and extend
  74        * past it. If so, include them in this allocation.
  75        */
  76       for (unsigned i = 1; i < reg_size; i++)
  77          reg_size = MAX2(vec4s[i + loc] + i, reg_size);
  78
  79       fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
  80       for (unsigned i = 0; i < reg_size; i++)
  81          outputs[loc + i] = offset(reg, bld, 4 * i);
  82
  83       loc += reg_size;
  84    }
  85 }
  86
  87 void
  88 fs_visitor::nir_setup_uniforms()
  89 {
  90    /* Only the first compile gets to set up uniforms. */
  91    if (push_constant_loc) {
  92       assert(pull_constant_loc);
  93       return;
  94    }
  95
  96    uniforms = nir->num_uniforms / 4;
  97
  98    if (stage == MESA_SHADER_COMPUTE) {
  99       /* Add a uniform for the thread local id.  It must be the last uniform
 100        * on the list.
 101        */
 102       assert(uniforms == prog_data->nr_params);
 103       uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
 104       *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
 105       subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
 106    }
 107 }
 108
 109 static bool
 110 emit_system_values_block(nir_block *block, fs_visitor *v)
 111 {
 112    fs_reg *reg;
 113
 114    nir_foreach_instr(instr, block) {
 115       if (instr->type != nir_instr_type_intrinsic)
 116          continue;
 117
 118       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 119       switch (intrin->intrinsic) {
 120       case nir_intrinsic_load_vertex_id:
 121       case nir_intrinsic_load_base_vertex:
 122          unreachable("should be lowered by nir_lower_system_values().");
 123
 124       case nir_intrinsic_load_vertex_id_zero_base:
 125       case nir_intrinsic_load_is_indexed_draw:
 126       case nir_intrinsic_load_first_vertex:
 127       case nir_intrinsic_load_instance_id:
 128       case nir_intrinsic_load_base_instance:
 129       case nir_intrinsic_load_draw_id:
 130          unreachable("should be lowered by brw_nir_lower_vs_inputs().");
 131
 132       case nir_intrinsic_load_invocation_id:
 133          if (v->stage == MESA_SHADER_TESS_CTRL)
 134             break;
 135          assert(v->stage == MESA_SHADER_GEOMETRY);
 136          reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
 137          if (reg->file == BAD_FILE) {
 138             const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
 139             fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 140             fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
 141             abld.SHR(iid, g1, brw_imm_ud(27u));
 142             *reg = iid;
 143          }
 144          break;
 145
 146       case nir_intrinsic_load_sample_pos:
 147          assert(v->stage == MESA_SHADER_FRAGMENT);
 148          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
 149          if (reg->file == BAD_FILE)
 150             *reg = *v->emit_samplepos_setup();
 151          break;
 152
 153       case nir_intrinsic_load_sample_id:
 154          assert(v->stage == MESA_SHADER_FRAGMENT);
 155          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
 156          if (reg->file == BAD_FILE)
 157             *reg = *v->emit_sampleid_setup();
 158          break;
 159
 160       case nir_intrinsic_load_sample_mask_in:
 161          assert(v->stage == MESA_SHADER_FRAGMENT);
 162          assert(v->devinfo->gen >= 7);
 163          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
 164          if (reg->file == BAD_FILE)
 165             *reg = *v->emit_samplemaskin_setup();
 166          break;
 167
 168       case nir_intrinsic_load_work_group_id:
 169          assert(v->stage == MESA_SHADER_COMPUTE);
 170          reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
 171          if (reg->file == BAD_FILE)
 172             *reg = *v->emit_cs_work_group_id_setup();
 173          break;
 174
 175       case nir_intrinsic_load_helper_invocation:
 176          assert(v->stage == MESA_SHADER_FRAGMENT);
 177          reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
 178          if (reg->file == BAD_FILE) {
 179             const fs_builder abld =
 180                v->bld.annotate("gl_HelperInvocation", NULL);
 181
 182             /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
 183              * pixel mask is in g1.7 of the thread payload.
 184              *
 185              * We move the per-channel pixel enable bit to the low bit of each
 186              * channel by shifting the byte containing the pixel mask by the
 187              * vector immediate 0x76543210UV.
 188              *
 189              * The region of <1,8,0> reads only 1 byte (the pixel masks for
 190              * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
 191              * masks for 2 and 3) in SIMD16.
 192              */
 193             fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
 194
 195             for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) {
 196                const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i);
 197                hbld.SHR(offset(shifted, hbld, i),
 198                         stride(retype(brw_vec1_grf(1 + i, 7),
 199                                       BRW_REGISTER_TYPE_UB),
 200                                1, 8, 0),
 201                         brw_imm_v(0x76543210));
 202             }
 203
 204             /* A set bit in the pixel mask means the channel is enabled, but
 205              * that is the opposite of gl_HelperInvocation so we need to invert
 206              * the mask.
 207              *
 208              * The negate source-modifier bit of logical instructions on Gen8+
 209              * performs 1's complement negation, so we can use that instead of
 210              * a NOT instruction.
 211              */
 212             fs_reg inverted = negate(shifted);
 213             if (v->devinfo->gen < 8) {
 214                inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
 215                abld.NOT(inverted, shifted);
 216             }
 217
 218             /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
 219              * with 1 and negating.
 220              */
 221             fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
 222             abld.AND(anded, inverted, brw_imm_uw(1));
 223
 224             fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
 225             abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
 226             *reg = dst;
 227          }
 228          break;
 229
 230       default:
 231          break;
 232       }
 233    }
 234
 235    return true;
 236 }
 237
 238 void
 239 fs_visitor::nir_emit_system_values()
 240 {
 241    nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
 242    for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
 243       nir_system_values[i] = fs_reg();
 244    }
 245
 246    /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
 247     * never end up using it.
 248     */
 249    {
 250       const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
 251       fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
 252       reg = abld.vgrf(BRW_REGISTER_TYPE_UW);
 253
 254       const fs_builder allbld8 = abld.group(8, 0).exec_all();
 255       allbld8.MOV(reg, brw_imm_v(0x76543210));
 256       if (dispatch_width > 8)
 257          allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
 258       if (dispatch_width > 16) {
 259          const fs_builder allbld16 = abld.group(16, 0).exec_all();
 260          allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
 261       }
 262    }
 263
 264    nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir);
 265    nir_foreach_block(block, impl)
 266       emit_system_values_block(block, this);
 267 }
 268
 269 /*
 270  * Returns a type based on a reference_type (word, float, half-float) and a
 271  * given bit_size.
 272  *
 273  * Reference BRW_REGISTER_TYPE are HF,F,DF,W,D,UW,UD.
 274  *
 275  * @FIXME: 64-bit return types are always DF on integer types to maintain
 276  * compability with uses of DF previously to the introduction of int64
 277  * support.
 278  */
 279 static brw_reg_type
 280 brw_reg_type_from_bit_size(const unsigned bit_size,
 281                            const brw_reg_type reference_type)
 282 {
 283    switch(reference_type) {
 284    case BRW_REGISTER_TYPE_HF:
 285    case BRW_REGISTER_TYPE_F:
 286    case BRW_REGISTER_TYPE_DF:
 287       switch(bit_size) {
 288       case 16:
 289          return BRW_REGISTER_TYPE_HF;
 290       case 32:
 291          return BRW_REGISTER_TYPE_F;
 292       case 64:
 293          return BRW_REGISTER_TYPE_DF;
 294       default:
 295          unreachable("Invalid bit size");
 296       }
 297    case BRW_REGISTER_TYPE_B:
 298    case BRW_REGISTER_TYPE_W:
 299    case BRW_REGISTER_TYPE_D:
 300    case BRW_REGISTER_TYPE_Q:
 301       switch(bit_size) {
 302       case 8:
 303          return BRW_REGISTER_TYPE_B;
 304       case 16:
 305          return BRW_REGISTER_TYPE_W;
 306       case 32:
 307          return BRW_REGISTER_TYPE_D;
 308       case 64:
 309          return BRW_REGISTER_TYPE_Q;
 310       default:
 311          unreachable("Invalid bit size");
 312       }
 313    case BRW_REGISTER_TYPE_UB:
 314    case BRW_REGISTER_TYPE_UW:
 315    case BRW_REGISTER_TYPE_UD:
 316    case BRW_REGISTER_TYPE_UQ:
 317       switch(bit_size) {
 318       case 8:
 319          return BRW_REGISTER_TYPE_UB;
 320       case 16:
 321          return BRW_REGISTER_TYPE_UW;
 322       case 32:
 323          return BRW_REGISTER_TYPE_UD;
 324       case 64:
 325          return BRW_REGISTER_TYPE_UQ;
 326       default:
 327          unreachable("Invalid bit size");
 328       }
 329    default:
 330       unreachable("Unknown type");
 331    }
 332 }
 333
 334 void
 335 fs_visitor::nir_emit_impl(nir_function_impl *impl)
 336 {
 337    nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
 338    for (unsigned i = 0; i < impl->reg_alloc; i++) {
 339       nir_locals[i] = fs_reg();
 340    }
 341
 342    foreach_list_typed(nir_register, reg, node, &impl->registers) {
 343       unsigned array_elems =
 344          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
 345       unsigned size = array_elems * reg->num_components;
 346       const brw_reg_type reg_type =
 347          brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
 348       nir_locals[reg->index] = bld.vgrf(reg_type, size);
 349    }
 350
 351    nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
 352                              impl->ssa_alloc);
 353
 354    nir_emit_cf_list(&impl->body);
 355 }
 356
 357 void
 358 fs_visitor::nir_emit_cf_list(exec_list *list)
 359 {
 360    exec_list_validate(list);
 361    foreach_list_typed(nir_cf_node, node, node, list) {
 362       switch (node->type) {
 363       case nir_cf_node_if:
 364          nir_emit_if(nir_cf_node_as_if(node));
 365          break;
 366
 367       case nir_cf_node_loop:
 368          nir_emit_loop(nir_cf_node_as_loop(node));
 369          break;
 370
 371       case nir_cf_node_block:
 372          nir_emit_block(nir_cf_node_as_block(node));
 373          break;
 374
 375       default:
 376          unreachable("Invalid CFG node block");
 377       }
 378    }
 379 }
 380
 381 void
 382 fs_visitor::nir_emit_if(nir_if *if_stmt)
 383 {
 384    bool invert;
 385    fs_reg cond_reg;
 386
 387    /* If the condition has the form !other_condition, use other_condition as
 388     * the source, but invert the predicate on the if instruction.
 389     */
 390    nir_alu_instr *const cond = nir_src_as_alu_instr(&if_stmt->condition);
 391    if (cond != NULL && cond->op == nir_op_inot) {
 392       assert(!cond->src[0].negate);
 393       assert(!cond->src[0].abs);
 394
 395       invert = true;
 396       cond_reg = get_nir_src(cond->src[0].src);
 397    } else {
 398       invert = false;
 399       cond_reg = get_nir_src(if_stmt->condition);
 400    }
 401
 402    /* first, put the condition into f0 */
 403    fs_inst *inst = bld.MOV(bld.null_reg_d(),
 404                            retype(cond_reg, BRW_REGISTER_TYPE_D));
 405    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 406
 407    bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;
 408
 409    nir_emit_cf_list(&if_stmt->then_list);
 410
 411    /* note: if the else is empty, dead CF elimination will remove it */
 412    bld.emit(BRW_OPCODE_ELSE);
 413
 414    nir_emit_cf_list(&if_stmt->else_list);
 415
 416    bld.emit(BRW_OPCODE_ENDIF);
 417
 418    if (devinfo->gen < 7)
 419       limit_dispatch_width(16, "Non-uniform control flow unsupported "
 420                            "in SIMD32 mode.");
 421 }
 422
 423 void
 424 fs_visitor::nir_emit_loop(nir_loop *loop)
 425 {
 426    bld.emit(BRW_OPCODE_DO);
 427
 428    nir_emit_cf_list(&loop->body);
 429
 430    bld.emit(BRW_OPCODE_WHILE);
 431
 432    if (devinfo->gen < 7)
 433       limit_dispatch_width(16, "Non-uniform control flow unsupported "
 434                            "in SIMD32 mode.");
 435 }
 436
 437 void
 438 fs_visitor::nir_emit_block(nir_block *block)
 439 {
 440    nir_foreach_instr(instr, block) {
 441       nir_emit_instr(instr);
 442    }
 443 }
 444
 445 void
 446 fs_visitor::nir_emit_instr(nir_instr *instr)
 447 {
 448    const fs_builder abld = bld.annotate(NULL, instr);
 449
 450    switch (instr->type) {
 451    case nir_instr_type_alu:
 452       nir_emit_alu(abld, nir_instr_as_alu(instr));
 453       break;
 454
 455    case nir_instr_type_deref:
 456       /* Derefs can exist for images but they do nothing */
 457       break;
 458
 459    case nir_instr_type_intrinsic:
 460       switch (stage) {
 461       case MESA_SHADER_VERTEX:
 462          nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 463          break;
 464       case MESA_SHADER_TESS_CTRL:
 465          nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 466          break;
 467       case MESA_SHADER_TESS_EVAL:
 468          nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
 469          break;
 470       case MESA_SHADER_GEOMETRY:
 471          nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 472          break;
 473       case MESA_SHADER_FRAGMENT:
 474          nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 475          break;
 476       case MESA_SHADER_COMPUTE:
 477          nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 478          break;
 479       default:
 480          unreachable("unsupported shader stage");
 481       }
 482       break;
 483
 484    case nir_instr_type_tex:
 485       nir_emit_texture(abld, nir_instr_as_tex(instr));
 486       break;
 487
 488    case nir_instr_type_load_const:
 489       nir_emit_load_const(abld, nir_instr_as_load_const(instr));
 490       break;
 491
 492    case nir_instr_type_ssa_undef:
 493       /* We create a new VGRF for undefs on every use (by handling
 494        * them in get_nir_src()), rather than for each definition.
 495        * This helps register coalescing eliminate MOVs from undef.
 496        */
 497       break;
 498
 499    case nir_instr_type_jump:
 500       nir_emit_jump(abld, nir_instr_as_jump(instr));
 501       break;
 502
 503    default:
 504       unreachable("unknown instruction type");
 505    }
 506 }
 507
 508 /**
 509  * Recognizes a parent instruction of nir_op_extract_* and changes the type to
 510  * match instr.
 511  */
 512 bool
 513 fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
 514                                       const fs_reg &result)
 515 {
 516    if (!instr->src[0].src.is_ssa ||
 517        !instr->src[0].src.ssa->parent_instr)
 518       return false;
 519
 520    if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
 521       return false;
 522
 523    nir_alu_instr *src0 =
 524       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
 525
 526    if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
 527        src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
 528       return false;
 529
 530    /* If either opcode has source modifiers, bail.
 531     *
 532     * TODO: We can potentially handle source modifiers if both of the opcodes
 533     * we're combining are signed integers.
 534     */
 535    if (instr->src[0].abs || instr->src[0].negate ||
 536        src0->src[0].abs || src0->src[0].negate)
 537       return false;
 538
 539    unsigned element = nir_src_as_uint(src0->src[1].src);
 540
 541    /* Element type to extract.*/
 542    const brw_reg_type type = brw_int_type(
 543       src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
 544       src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
 545
 546    fs_reg op0 = get_nir_src(src0->src[0].src);
 547    op0.type = brw_type_for_nir_type(devinfo,
 548       (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
 549                      nir_src_bit_size(src0->src[0].src)));
 550    op0 = offset(op0, bld, src0->src[0].swizzle[0]);
 551
 552    set_saturate(instr->dest.saturate,
 553                 bld.MOV(result, subscript(op0, type, element)));
 554    return true;
 555 }
 556
 557 bool
 558 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
 559                                          const fs_reg &result)
 560 {
 561    if (!instr->src[0].src.is_ssa ||
 562        instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
 563       return false;
 564
 565    nir_intrinsic_instr *src0 =
 566       nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
 567
 568    if (src0->intrinsic != nir_intrinsic_load_front_face)
 569       return false;
 570
 571    if (!nir_src_is_const(instr->src[1].src) ||
 572        !nir_src_is_const(instr->src[2].src))
 573       return false;
 574
 575    const float value1 = nir_src_as_float(instr->src[1].src);
 576    const float value2 = nir_src_as_float(instr->src[2].src);
 577    if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
 578       return false;
 579
 580    /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
 581    assert(value1 == -value2);
 582
 583    fs_reg tmp = vgrf(glsl_type::int_type);
 584
 585    if (devinfo->gen >= 6) {
 586       /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
 587       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
 588
 589       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 590        *
 591        *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
 592        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
 593        *
 594        * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
 595        *
 596        * This negation looks like it's safe in practice, because bits 0:4 will
 597        * surely be TRIANGLES
 598        */
 599
 600       if (value1 == -1.0f) {
 601          g0.negate = true;
 602       }
 603
 604       bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
 605              g0, brw_imm_uw(0x3f80));
 606    } else {
 607       /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
 608       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
 609
 610       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 611        *
 612        *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
 613        *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
 614        *
 615        * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
 616        *
 617        * This negation looks like it's safe in practice, because bits 0:4 will
 618        * surely be TRIANGLES
 619        */
 620
 621       if (value1 == -1.0f) {
 622          g1_6.negate = true;
 623       }
 624
 625       bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
 626    }
 627    bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
 628
 629    return true;
 630 }
 631
 632 static void
 633 emit_find_msb_using_lzd(const fs_builder &bld,
 634                         const fs_reg &result,
 635                         const fs_reg &src,
 636                         bool is_signed)
 637 {
 638    fs_inst *inst;
 639    fs_reg temp = src;
 640
 641    if (is_signed) {
 642       /* LZD of an absolute value source almost always does the right
 643        * thing.  There are two problem values:
 644        *
 645        * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
 646        *   0.  However, findMSB(int(0x80000000)) == 30.
 647        *
 648        * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
 649        *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
 650        *
 651        *    For a value of zero or negative one, -1 will be returned.
 652        *
 653        * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
 654        *   findMSB(-(1<<x)) should return x-1.
 655        *
 656        * For all negative number cases, including 0x80000000 and
 657        * 0xffffffff, the correct value is obtained from LZD if instead of
 658        * negating the (already negative) value the logical-not is used.  A
 659        * conditonal logical-not can be achieved in two instructions.
 660        */
 661       temp = bld.vgrf(BRW_REGISTER_TYPE_D);
 662
 663       bld.ASR(temp, src, brw_imm_d(31));
 664       bld.XOR(temp, temp, src);
 665    }
 666
 667    bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
 668            retype(temp, BRW_REGISTER_TYPE_UD));
 669
 670    /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
 671     * from the LSB side. Subtract the result from 31 to convert the MSB
 672     * count into an LSB count.  If no bits are set, LZD will return 32.
 673     * 31-32 = -1, which is exactly what findMSB() is supposed to return.
 674     */
 675    inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
 676    inst->src[0].negate = true;
 677 }
 678
 679 static brw_rnd_mode
 680 brw_rnd_mode_from_nir_op (const nir_op op) {
 681    switch (op) {
 682    case nir_op_f2f16_rtz:
 683       return BRW_RND_MODE_RTZ;
 684    case nir_op_f2f16_rtne:
 685       return BRW_RND_MODE_RTNE;
 686    default:
 687       unreachable("Operation doesn't support rounding mode");
 688    }
 689 }
 690
 691 fs_reg
 692 fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld,
 693                                                 nir_alu_instr *instr,
 694                                                 fs_reg *op,
 695                                                 bool need_dest)
 696 {
 697    fs_reg result =
 698       need_dest ? get_nir_dest(instr->dest.dest) : bld.null_reg_ud();
 699
 700    result.type = brw_type_for_nir_type(devinfo,
 701       (nir_alu_type)(nir_op_infos[instr->op].output_type |
 702                      nir_dest_bit_size(instr->dest.dest)));
 703
 704    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 705       op[i] = get_nir_src(instr->src[i].src);
 706       op[i].type = brw_type_for_nir_type(devinfo,
 707          (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
 708                         nir_src_bit_size(instr->src[i].src)));
 709       op[i].abs = instr->src[i].abs;
 710       op[i].negate = instr->src[i].negate;
 711    }
 712
 713    /* Move and vecN instrutions may still be vectored.  Return the raw,
 714     * vectored source and destination so that fs_visitor::nir_emit_alu can
 715     * handle it.  Other callers should not have to handle these kinds of
 716     * instructions.
 717     */
 718    switch (instr->op) {
 719    case nir_op_imov:
 720    case nir_op_fmov:
 721    case nir_op_vec2:
 722    case nir_op_vec3:
 723    case nir_op_vec4:
 724       return result;
 725    default:
 726       break;
 727    }
 728
 729    /* At this point, we have dealt with any instruction that operates on
 730     * more than a single channel.  Therefore, we can just adjust the source
 731     * and destination registers for that channel and emit the instruction.
 732     */
 733    unsigned channel = 0;
 734    if (nir_op_infos[instr->op].output_size == 0) {
 735       /* Since NIR is doing the scalarizing for us, we should only ever see
 736        * vectorized operations with a single channel.
 737        */
 738       assert(util_bitcount(instr->dest.write_mask) == 1);
 739       channel = ffs(instr->dest.write_mask) - 1;
 740
 741       result = offset(result, bld, channel);
 742    }
 743
 744    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 745       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
 746       op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
 747    }
 748
 749    return result;
 750 }
 751
 752 void
 753 fs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr,
 754                                  fs_reg *op)
 755 {
 756    for (unsigned i = 0; i < 2; i++) {
 757       nir_alu_instr *const inot_instr =
 758          nir_src_as_alu_instr(&instr->src[i].src);
 759
 760       if (inot_instr != NULL && inot_instr->op == nir_op_inot &&
 761           !inot_instr->src[0].abs && !inot_instr->src[0].negate) {
 762          /* The source of the inot is now the source of instr. */
 763          prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false);
 764
 765          assert(!op[i].negate);
 766          op[i].negate = true;
 767       } else {
 768          op[i] = resolve_source_modifiers(op[i]);
 769       }
 770    }
 771 }
 772
 773 bool
 774 fs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld,
 775                                   fs_reg result,
 776                                   nir_alu_instr *instr)
 777 {
 778    if (devinfo->gen < 6 || devinfo->gen >= 12)
 779       return false;
 780
 781    nir_alu_instr *const inot_instr = nir_src_as_alu_instr(&instr->src[0].src);
 782
 783    if (inot_instr == NULL || inot_instr->op != nir_op_inot)
 784       return false;
 785
 786    /* HF is also possible as a destination on BDW+.  For nir_op_b2i, the set
 787     * of valid size-changing combinations is a bit more complex.
 788     *
 789     * The source restriction is just because I was lazy about generating the
 790     * constant below.
 791     */
 792    if (nir_dest_bit_size(instr->dest.dest) != 32 ||
 793        nir_src_bit_size(inot_instr->src[0].src) != 32)
 794       return false;
 795
 796    /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0.  Since a can only be 0 or -1,
 797     * this is float(1 + a).
 798     */
 799    fs_reg op;
 800
 801    prepare_alu_destination_and_sources(bld, inot_instr, &op, false);
 802
 803    /* Ignore the saturate modifier, if there is one.  The result of the
 804     * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
 805     */
 806    bld.ADD(result, op, brw_imm_d(1));
 807
 808    return true;
 809 }
 810
 811 void
 812 fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
 813 {
 814    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
 815    fs_inst *inst;
 816
 817    fs_reg op[4];
 818    fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, true);
 819
 820    switch (instr->op) {
 821    case nir_op_imov:
 822    case nir_op_fmov:
 823    case nir_op_vec2:
 824    case nir_op_vec3:
 825    case nir_op_vec4: {
 826       fs_reg temp = result;
 827       bool need_extra_copy = false;
 828       for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 829          if (!instr->src[i].src.is_ssa &&
 830              instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
 831             need_extra_copy = true;
 832             temp = bld.vgrf(result.type, 4);
 833             break;
 834          }
 835       }
 836
 837       for (unsigned i = 0; i < 4; i++) {
 838          if (!(instr->dest.write_mask & (1 << i)))
 839             continue;
 840
 841          if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
 842             inst = bld.MOV(offset(temp, bld, i),
 843                            offset(op[0], bld, instr->src[0].swizzle[i]));
 844          } else {
 845             inst = bld.MOV(offset(temp, bld, i),
 846                            offset(op[i], bld, instr->src[i].swizzle[0]));
 847          }
 848          inst->saturate = instr->dest.saturate;
 849       }
 850
 851       /* In this case the source and destination registers were the same,
 852        * so we need to insert an extra set of moves in order to deal with
 853        * any swizzling.
 854        */
 855       if (need_extra_copy) {
 856          for (unsigned i = 0; i < 4; i++) {
 857             if (!(instr->dest.write_mask & (1 << i)))
 858                continue;
 859
 860             bld.MOV(offset(result, bld, i), offset(temp, bld, i));
 861          }
 862       }
 863       return;
 864    }
 865
 866    case nir_op_i2f32:
 867    case nir_op_u2f32:
 868       if (optimize_extract_to_float(instr, result))
 869          return;
 870       inst = bld.MOV(result, op[0]);
 871       inst->saturate = instr->dest.saturate;
 872       break;
 873
 874    case nir_op_f2f16_rtne:
 875    case nir_op_f2f16_rtz:
 876       bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
 877                brw_imm_d(brw_rnd_mode_from_nir_op(instr->op)));
 878       /* fallthrough */
 879
 880       /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending
 881        * on the HW gen, it is a special hw opcode or just a MOV, and
 882        * brw_F32TO16 (at brw_eu_emit) would do the work to chose.
 883        *
 884        * But if we want to use that opcode, we need to provide support on
 885        * different optimizations and lowerings. As right now HF support is
 886        * only for gen8+, it will be better to use directly the MOV, and use
 887        * BRW_OPCODE_F32TO16 when/if we work for HF support on gen7.
 888        */
 889
 890    case nir_op_f2f16:
 891       inst = bld.MOV(result, op[0]);
 892       inst->saturate = instr->dest.saturate;
 893       break;
 894
 895    case nir_op_f2f64:
 896    case nir_op_f2i64:
 897    case nir_op_f2u64:
 898       assert(type_sz(op[0].type) > 2); /* brw_nir_lower_conversions */
 899       inst = bld.MOV(result, op[0]);
 900       inst->saturate = instr->dest.saturate;
 901       break;
 902
 903    case nir_op_b2i8:
 904    case nir_op_b2i16:
 905    case nir_op_b2i32:
 906    case nir_op_b2i64:
 907    case nir_op_b2f16:
 908    case nir_op_b2f32:
 909    case nir_op_b2f64:
 910       if (try_emit_b2fi_of_inot(bld, result, instr))
 911          break;
 912       op[0].type = BRW_REGISTER_TYPE_D;
 913       op[0].negate = !op[0].negate;
 914       /* fallthrough */
 915    case nir_op_i2f64:
 916    case nir_op_i2i64:
 917    case nir_op_u2f64:
 918    case nir_op_u2u64:
 919       assert(type_sz(op[0].type) > 1); /* brw_nir_lower_conversions */
 920       /* fallthrough */
 921    case nir_op_f2f32:
 922    case nir_op_f2i32:
 923    case nir_op_f2u32:
 924    case nir_op_f2i16:
 925    case nir_op_f2u16:
 926    case nir_op_i2i32:
 927    case nir_op_u2u32:
 928    case nir_op_i2i16:
 929    case nir_op_u2u16:
 930    case nir_op_i2f16:
 931    case nir_op_u2f16:
 932    case nir_op_i2i8:
 933    case nir_op_u2u8:
 934       inst = bld.MOV(result, op[0]);
 935       inst->saturate = instr->dest.saturate;
 936       break;
 937
 938    case nir_op_fsign: {
 939       assert(!instr->dest.saturate);
 940       if (op[0].abs) {
 941          /* Straightforward since the source can be assumed to be either
 942           * strictly >= 0 or strictly <= 0 depending on the setting of the
 943           * negate flag.
 944           */
 945          set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0]));
 946
 947          inst = (op[0].negate)
 948             ? bld.MOV(result, brw_imm_f(-1.0f))
 949             : bld.MOV(result, brw_imm_f(1.0f));
 950
 951          set_predicate(BRW_PREDICATE_NORMAL, inst);
 952       } else if (type_sz(op[0].type) < 8) {
 953          /* AND(val, 0x80000000) gives the sign bit.
 954           *
 955           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
 956           * zero.
 957           */
 958          bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
 959
 960          fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
 961          op[0].type = BRW_REGISTER_TYPE_UD;
 962          result.type = BRW_REGISTER_TYPE_UD;
 963          bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
 964
 965          inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
 966          inst->predicate = BRW_PREDICATE_NORMAL;
 967       } else {
 968          /* For doubles we do the same but we need to consider:
 969           *
 970           * - 2-src instructions can't operate with 64-bit immediates
 971           * - The sign is encoded in the high 32-bit of each DF
 972           * - We need to produce a DF result.
 973           */
 974
 975          fs_reg zero = vgrf(glsl_type::double_type);
 976          bld.MOV(zero, setup_imm_df(bld, 0.0));
 977          bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
 978
 979          bld.MOV(result, zero);
 980
 981          fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
 982          bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
 983                  brw_imm_ud(0x80000000u));
 984
 985          set_predicate(BRW_PREDICATE_NORMAL,
 986                        bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
 987       }
 988       break;
 989    }
 990
 991    case nir_op_frcp:
 992       inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
 993       inst->saturate = instr->dest.saturate;
 994       break;
 995
 996    case nir_op_fexp2:
 997       inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
 998       inst->saturate = instr->dest.saturate;
 999       break;
1000
1001    case nir_op_flog2:
1002       inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
1003       inst->saturate = instr->dest.saturate;
1004       break;
1005
1006    case nir_op_fsin:
1007       inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
1008       inst->saturate = instr->dest.saturate;
1009       break;
1010
1011    case nir_op_fcos:
1012       inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
1013       inst->saturate = instr->dest.saturate;
1014       break;
1015
1016    case nir_op_fddx:
1017       if (fs_key->high_quality_derivatives) {
1018          inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
1019       } else {
1020          inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
1021       }
1022       inst->saturate = instr->dest.saturate;
1023       break;
1024    case nir_op_fddx_fine:
1025       inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
1026       inst->saturate = instr->dest.saturate;
1027       break;
1028    case nir_op_fddx_coarse:
1029       inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
1030       inst->saturate = instr->dest.saturate;
1031       break;
1032    case nir_op_fddy:
1033       if (fs_key->high_quality_derivatives) {
1034          inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
1035       } else {
1036          inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
1037       }
1038       inst->saturate = instr->dest.saturate;
1039       break;
1040    case nir_op_fddy_fine:
1041       inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
1042       inst->saturate = instr->dest.saturate;
1043       break;
1044    case nir_op_fddy_coarse:
1045       inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
1046       inst->saturate = instr->dest.saturate;
1047       break;
1048
1049    case nir_op_iadd:
1050    case nir_op_fadd:
1051       inst = bld.ADD(result, op[0], op[1]);
1052       inst->saturate = instr->dest.saturate;
1053       break;
1054
1055    case nir_op_uadd_sat:
1056       inst = bld.ADD(result, op[0], op[1]);
1057       inst->saturate = true;
1058       break;
1059
1060    case nir_op_fmul:
1061       inst = bld.MUL(result, op[0], op[1]);
1062       inst->saturate = instr->dest.saturate;
1063       break;
1064
1065    case nir_op_imul_2x32_64:
1066    case nir_op_umul_2x32_64:
1067       bld.MUL(result, op[0], op[1]);
1068       break;
1069
1070    case nir_op_imul:
1071       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1072       bld.MUL(result, op[0], op[1]);
1073       break;
1074
1075    case nir_op_imul_high:
1076    case nir_op_umul_high:
1077       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1078       bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
1079       break;
1080
1081    case nir_op_idiv:
1082    case nir_op_udiv:
1083       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1084       bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
1085       break;
1086
1087    case nir_op_uadd_carry:
1088       unreachable("Should have been lowered by carry_to_arith().");
1089
1090    case nir_op_usub_borrow:
1091       unreachable("Should have been lowered by borrow_to_arith().");
1092
1093    case nir_op_umod:
1094    case nir_op_irem:
1095       /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1096        * appears that our hardware just does the right thing for signed
1097        * remainder.
1098        */
1099       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1100       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1101       break;
1102
1103    case nir_op_imod: {
1104       /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
1105       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1106
1107       /* Math instructions don't support conditional mod */
1108       inst = bld.MOV(bld.null_reg_d(), result);
1109       inst->conditional_mod = BRW_CONDITIONAL_NZ;
1110
1111       /* Now, we need to determine if signs of the sources are different.
1112        * When we XOR the sources, the top bit is 0 if they are the same and 1
1113        * if they are different.  We can then use a conditional modifier to
1114        * turn that into a predicate.  This leads us to an XOR.l instruction.
1115        *
1116        * Technically, according to the PRM, you're not allowed to use .l on a
1117        * XOR instruction.  However, emperical experiments and Curro's reading
1118        * of the simulator source both indicate that it's safe.
1119        */
1120       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
1121       inst = bld.XOR(tmp, op[0], op[1]);
1122       inst->predicate = BRW_PREDICATE_NORMAL;
1123       inst->conditional_mod = BRW_CONDITIONAL_L;
1124
1125       /* If the result of the initial remainder operation is non-zero and the
1126        * two sources have different signs, add in a copy of op[1] to get the
1127        * final integer modulus value.
1128        */
1129       inst = bld.ADD(result, result, op[1]);
1130       inst->predicate = BRW_PREDICATE_NORMAL;
1131       break;
1132    }
1133
1134    case nir_op_flt32:
1135    case nir_op_fge32:
1136    case nir_op_feq32:
1137    case nir_op_fne32: {
1138       fs_reg dest = result;
1139
1140       const uint32_t bit_size =  nir_src_bit_size(instr->src[0].src);
1141       if (bit_size != 32)
1142          dest = bld.vgrf(op[0].type, 1);
1143
1144       brw_conditional_mod cond;
1145       switch (instr->op) {
1146       case nir_op_flt32:
1147          cond = BRW_CONDITIONAL_L;
1148          break;
1149       case nir_op_fge32:
1150          cond = BRW_CONDITIONAL_GE;
1151          break;
1152       case nir_op_feq32:
1153          cond = BRW_CONDITIONAL_Z;
1154          break;
1155       case nir_op_fne32:
1156          cond = BRW_CONDITIONAL_NZ;
1157          break;
1158       default:
1159          unreachable("bad opcode");
1160       }
1161
1162       bld.CMP(dest, op[0], op[1], cond);
1163
1164       if (bit_size > 32) {
1165          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1166       } else if(bit_size < 32) {
1167          /* When we convert the result to 32-bit we need to be careful and do
1168           * it as a signed conversion to get sign extension (for 32-bit true)
1169           */
1170          const brw_reg_type src_type =
1171             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1172
1173          bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1174       }
1175       break;
1176    }
1177
1178    case nir_op_ilt32:
1179    case nir_op_ult32:
1180    case nir_op_ige32:
1181    case nir_op_uge32:
1182    case nir_op_ieq32:
1183    case nir_op_ine32: {
1184       fs_reg dest = result;
1185
1186       const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1187       if (bit_size != 32)
1188          dest = bld.vgrf(op[0].type, 1);
1189
1190       brw_conditional_mod cond;
1191       switch (instr->op) {
1192       case nir_op_ilt32:
1193       case nir_op_ult32:
1194          cond = BRW_CONDITIONAL_L;
1195          break;
1196       case nir_op_ige32:
1197       case nir_op_uge32:
1198          cond = BRW_CONDITIONAL_GE;
1199          break;
1200       case nir_op_ieq32:
1201          cond = BRW_CONDITIONAL_Z;
1202          break;
1203       case nir_op_ine32:
1204          cond = BRW_CONDITIONAL_NZ;
1205          break;
1206       default:
1207          unreachable("bad opcode");
1208       }
1209       bld.CMP(dest, op[0], op[1], cond);
1210
1211       if (bit_size > 32) {
1212          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1213       } else if (bit_size < 32) {
1214          /* When we convert the result to 32-bit we need to be careful and do
1215           * it as a signed conversion to get sign extension (for 32-bit true)
1216           */
1217          const brw_reg_type src_type =
1218             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1219
1220          bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1221       }
1222       break;
1223    }
1224
1225    case nir_op_inot:
1226       if (devinfo->gen >= 8) {
1227          nir_alu_instr *const inot_src_instr =
1228             nir_src_as_alu_instr(&instr->src[0].src);
1229
1230          if (inot_src_instr != NULL &&
1231              (inot_src_instr->op == nir_op_ior ||
1232               inot_src_instr->op == nir_op_ixor ||
1233               inot_src_instr->op == nir_op_iand) &&
1234              !inot_src_instr->src[0].abs &&
1235              !inot_src_instr->src[0].negate &&
1236              !inot_src_instr->src[1].abs &&
1237              !inot_src_instr->src[1].negate) {
1238             /* The sources of the source logical instruction are now the
1239              * sources of the instruction that will be generated.
1240              */
1241             prepare_alu_destination_and_sources(bld, inot_src_instr, op, false);
1242             resolve_inot_sources(bld, inot_src_instr, op);
1243
1244             /* Smash all of the sources and destination to be signed.  This
1245              * doesn't matter for the operation of the instruction, but cmod
1246              * propagation fails on unsigned sources with negation (due to
1247              * fs_inst::can_do_cmod returning false).
1248              */
1249             result.type =
1250                brw_type_for_nir_type(devinfo,
1251                                      (nir_alu_type)(nir_type_int |
1252                                                     nir_dest_bit_size(instr->dest.dest)));
1253             op[0].type =
1254                brw_type_for_nir_type(devinfo,
1255                                      (nir_alu_type)(nir_type_int |
1256                                                     nir_src_bit_size(inot_src_instr->src[0].src)));
1257             op[1].type =
1258                brw_type_for_nir_type(devinfo,
1259                                      (nir_alu_type)(nir_type_int |
1260                                                     nir_src_bit_size(inot_src_instr->src[1].src)));
1261
1262             /* For XOR, only invert one of the sources.  Arbitrarily choose
1263              * the first source.
1264              */
1265             op[0].negate = !op[0].negate;
1266             if (inot_src_instr->op != nir_op_ixor)
1267                op[1].negate = !op[1].negate;
1268
1269             switch (inot_src_instr->op) {
1270             case nir_op_ior:
1271                bld.AND(result, op[0], op[1]);
1272                return;
1273
1274             case nir_op_iand:
1275                bld.OR(result, op[0], op[1]);
1276                return;
1277
1278             case nir_op_ixor:
1279                bld.XOR(result, op[0], op[1]);
1280                return;
1281
1282             default:
1283                unreachable("impossible opcode");
1284             }
1285          }
1286          op[0] = resolve_source_modifiers(op[0]);
1287       }
1288       bld.NOT(result, op[0]);
1289       break;
1290    case nir_op_ixor:
1291       if (devinfo->gen >= 8) {
1292          resolve_inot_sources(bld, instr, op);
1293       }
1294       bld.XOR(result, op[0], op[1]);
1295       break;
1296    case nir_op_ior:
1297       if (devinfo->gen >= 8) {
1298          resolve_inot_sources(bld, instr, op);
1299       }
1300       bld.OR(result, op[0], op[1]);
1301       break;
1302    case nir_op_iand:
1303       if (devinfo->gen >= 8) {
1304          resolve_inot_sources(bld, instr, op);
1305       }
1306       bld.AND(result, op[0], op[1]);
1307       break;
1308
1309    case nir_op_fdot2:
1310    case nir_op_fdot3:
1311    case nir_op_fdot4:
1312    case nir_op_b32all_fequal2:
1313    case nir_op_b32all_iequal2:
1314    case nir_op_b32all_fequal3:
1315    case nir_op_b32all_iequal3:
1316    case nir_op_b32all_fequal4:
1317    case nir_op_b32all_iequal4:
1318    case nir_op_b32any_fnequal2:
1319    case nir_op_b32any_inequal2:
1320    case nir_op_b32any_fnequal3:
1321    case nir_op_b32any_inequal3:
1322    case nir_op_b32any_fnequal4:
1323    case nir_op_b32any_inequal4:
1324       unreachable("Lowered by nir_lower_alu_reductions");
1325
1326    case nir_op_fnoise1_1:
1327    case nir_op_fnoise1_2:
1328    case nir_op_fnoise1_3:
1329    case nir_op_fnoise1_4:
1330    case nir_op_fnoise2_1:
1331    case nir_op_fnoise2_2:
1332    case nir_op_fnoise2_3:
1333    case nir_op_fnoise2_4:
1334    case nir_op_fnoise3_1:
1335    case nir_op_fnoise3_2:
1336    case nir_op_fnoise3_3:
1337    case nir_op_fnoise3_4:
1338    case nir_op_fnoise4_1:
1339    case nir_op_fnoise4_2:
1340    case nir_op_fnoise4_3:
1341    case nir_op_fnoise4_4:
1342       unreachable("not reached: should be handled by lower_noise");
1343
1344    case nir_op_ldexp:
1345       unreachable("not reached: should be handled by ldexp_to_arith()");
1346
1347    case nir_op_fsqrt:
1348       inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
1349       inst->saturate = instr->dest.saturate;
1350       break;
1351
1352    case nir_op_frsq:
1353       inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
1354       inst->saturate = instr->dest.saturate;
1355       break;
1356
1357    case nir_op_i2b32:
1358    case nir_op_f2b32: {
1359       uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1360       if (bit_size == 64) {
1361          /* two-argument instructions can't take 64-bit immediates */
1362          fs_reg zero;
1363          fs_reg tmp;
1364
1365          if (instr->op == nir_op_f2b32) {
1366             zero = vgrf(glsl_type::double_type);
1367             tmp = vgrf(glsl_type::double_type);
1368             bld.MOV(zero, setup_imm_df(bld, 0.0));
1369          } else {
1370             zero = vgrf(glsl_type::int64_t_type);
1371             tmp = vgrf(glsl_type::int64_t_type);
1372             bld.MOV(zero, brw_imm_q(0));
1373          }
1374
1375          /* A SIMD16 execution needs to be split in two instructions, so use
1376           * a vgrf instead of the flag register as dst so instruction splitting
1377           * works
1378           */
1379          bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
1380          bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
1381       } else {
1382          fs_reg zero;
1383          if (bit_size == 32) {
1384             zero = instr->op == nir_op_f2b32 ? brw_imm_f(0.0f) : brw_imm_d(0);
1385          } else {
1386             assert(bit_size == 16);
1387             zero = instr->op == nir_op_f2b32 ?
1388                retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
1389          }
1390          bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
1391       }
1392       break;
1393    }
1394
1395    case nir_op_ftrunc:
1396       inst = bld.RNDZ(result, op[0]);
1397       inst->saturate = instr->dest.saturate;
1398       break;
1399
1400    case nir_op_fceil: {
1401       op[0].negate = !op[0].negate;
1402       fs_reg temp = vgrf(glsl_type::float_type);
1403       bld.RNDD(temp, op[0]);
1404       temp.negate = true;
1405       inst = bld.MOV(result, temp);
1406       inst->saturate = instr->dest.saturate;
1407       break;
1408    }
1409    case nir_op_ffloor:
1410       inst = bld.RNDD(result, op[0]);
1411       inst->saturate = instr->dest.saturate;
1412       break;
1413    case nir_op_ffract:
1414       inst = bld.FRC(result, op[0]);
1415       inst->saturate = instr->dest.saturate;
1416       break;
1417    case nir_op_fround_even:
1418       inst = bld.RNDE(result, op[0]);
1419       inst->saturate = instr->dest.saturate;
1420       break;
1421
1422    case nir_op_fquantize2f16: {
1423       fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
1424       fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
1425       fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
1426
1427       /* The destination stride must be at least as big as the source stride. */
1428       tmp16.type = BRW_REGISTER_TYPE_W;
1429       tmp16.stride = 2;
1430
1431       /* Check for denormal */
1432       fs_reg abs_src0 = op[0];
1433       abs_src0.abs = true;
1434       bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1435               BRW_CONDITIONAL_L);
1436       /* Get the appropriately signed zero */
1437       bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
1438               retype(op[0], BRW_REGISTER_TYPE_UD),
1439               brw_imm_ud(0x80000000));
1440       /* Do the actual F32 -> F16 -> F32 conversion */
1441       bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
1442       bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
1443       /* Select that or zero based on normal status */
1444       inst = bld.SEL(result, zero, tmp32);
1445       inst->predicate = BRW_PREDICATE_NORMAL;
1446       inst->saturate = instr->dest.saturate;
1447       break;
1448    }
1449
1450    case nir_op_imin:
1451    case nir_op_umin:
1452    case nir_op_fmin:
1453       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1454       inst->saturate = instr->dest.saturate;
1455       break;
1456
1457    case nir_op_imax:
1458    case nir_op_umax:
1459    case nir_op_fmax:
1460       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1461       inst->saturate = instr->dest.saturate;
1462       break;
1463
1464    case nir_op_pack_snorm_2x16:
1465    case nir_op_pack_snorm_4x8:
1466    case nir_op_pack_unorm_2x16:
1467    case nir_op_pack_unorm_4x8:
1468    case nir_op_unpack_snorm_2x16:
1469    case nir_op_unpack_snorm_4x8:
1470    case nir_op_unpack_unorm_2x16:
1471    case nir_op_unpack_unorm_4x8:
1472    case nir_op_unpack_half_2x16:
1473    case nir_op_pack_half_2x16:
1474       unreachable("not reached: should be handled by lower_packing_builtins");
1475
1476    case nir_op_unpack_half_2x16_split_x:
1477       inst = bld.emit(BRW_OPCODE_F16TO32, result,
1478                       subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1479       inst->saturate = instr->dest.saturate;
1480       break;
1481    case nir_op_unpack_half_2x16_split_y:
1482       inst = bld.emit(BRW_OPCODE_F16TO32, result,
1483                       subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1484       inst->saturate = instr->dest.saturate;
1485       break;
1486
1487    case nir_op_pack_64_2x32_split:
1488    case nir_op_pack_32_2x16_split:
1489       bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1490       break;
1491
1492    case nir_op_unpack_64_2x32_split_x:
1493    case nir_op_unpack_64_2x32_split_y: {
1494       if (instr->op == nir_op_unpack_64_2x32_split_x)
1495          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
1496       else
1497          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
1498       break;
1499    }
1500
1501    case nir_op_unpack_32_2x16_split_x:
1502    case nir_op_unpack_32_2x16_split_y: {
1503       if (instr->op == nir_op_unpack_32_2x16_split_x)
1504          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1505       else
1506          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1507       break;
1508    }
1509
1510    case nir_op_fpow:
1511       inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
1512       inst->saturate = instr->dest.saturate;
1513       break;
1514
1515    case nir_op_bitfield_reverse:
1516       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1517       bld.BFREV(result, op[0]);
1518       break;
1519
1520    case nir_op_bit_count:
1521       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1522       bld.CBIT(result, op[0]);
1523       break;
1524
1525    case nir_op_ufind_msb: {
1526       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1527       emit_find_msb_using_lzd(bld, result, op[0], false);
1528       break;
1529    }
1530
1531    case nir_op_ifind_msb: {
1532       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1533
1534       if (devinfo->gen < 7) {
1535          emit_find_msb_using_lzd(bld, result, op[0], true);
1536       } else {
1537          bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1538
1539          /* FBH counts from the MSB side, while GLSL's findMSB() wants the
1540           * count from the LSB side. If FBH didn't return an error
1541           * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
1542           * count into an LSB count.
1543           */
1544          bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
1545
1546          inst = bld.ADD(result, result, brw_imm_d(31));
1547          inst->predicate = BRW_PREDICATE_NORMAL;
1548          inst->src[0].negate = true;
1549       }
1550       break;
1551    }
1552
1553    case nir_op_find_lsb:
1554       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1555
1556       if (devinfo->gen < 7) {
1557          fs_reg temp = vgrf(glsl_type::int_type);
1558
1559          /* (x & -x) generates a value that consists of only the LSB of x.
1560           * For all powers of 2, findMSB(y) == findLSB(y).
1561           */
1562          fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
1563          fs_reg negated_src = src;
1564
1565          /* One must be negated, and the other must be non-negated.  It
1566           * doesn't matter which is which.
1567           */
1568          negated_src.negate = true;
1569          src.negate = false;
1570
1571          bld.AND(temp, src, negated_src);
1572          emit_find_msb_using_lzd(bld, result, temp, false);
1573       } else {
1574          bld.FBL(result, op[0]);
1575       }
1576       break;
1577
1578    case nir_op_ubitfield_extract:
1579    case nir_op_ibitfield_extract:
1580       unreachable("should have been lowered");
1581    case nir_op_ubfe:
1582    case nir_op_ibfe:
1583       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1584       bld.BFE(result, op[2], op[1], op[0]);
1585       break;
1586    case nir_op_bfm:
1587       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1588       bld.BFI1(result, op[0], op[1]);
1589       break;
1590    case nir_op_bfi:
1591       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1592       bld.BFI2(result, op[0], op[1], op[2]);
1593       break;
1594
1595    case nir_op_bitfield_insert:
1596       unreachable("not reached: should have been lowered");
1597
1598    case nir_op_ishl:
1599       bld.SHL(result, op[0], op[1]);
1600       break;
1601    case nir_op_ishr:
1602       bld.ASR(result, op[0], op[1]);
1603       break;
1604    case nir_op_ushr:
1605       bld.SHR(result, op[0], op[1]);
1606       break;
1607
1608    case nir_op_pack_half_2x16_split:
1609       bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1610       break;
1611
1612    case nir_op_ffma:
1613       inst = bld.MAD(result, op[2], op[1], op[0]);
1614       inst->saturate = instr->dest.saturate;
1615       break;
1616
1617    case nir_op_flrp:
1618       inst = bld.LRP(result, op[0], op[1], op[2]);
1619       inst->saturate = instr->dest.saturate;
1620       break;
1621
1622    case nir_op_b32csel:
1623       if (optimize_frontfacing_ternary(instr, result))
1624          return;
1625
1626       bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1627       inst = bld.SEL(result, op[1], op[2]);
1628       inst->predicate = BRW_PREDICATE_NORMAL;
1629       break;
1630
1631    case nir_op_extract_u8:
1632    case nir_op_extract_i8: {
1633       unsigned byte = nir_src_as_uint(instr->src[1].src);
1634
1635       /* The PRMs say:
1636        *
1637        *    BDW+
1638        *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1639        *    Use two instructions and a word or DWord intermediate integer type.
1640        */
1641       if (nir_dest_bit_size(instr->dest.dest) == 64) {
1642          const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1643
1644          if (instr->op == nir_op_extract_i8) {
1645             /* If we need to sign extend, extract to a word first */
1646             fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
1647             bld.MOV(w_temp, subscript(op[0], type, byte));
1648             bld.MOV(result, w_temp);
1649          } else if (byte & 1) {
1650             /* Extract the high byte from the word containing the desired byte
1651              * offset.
1652              */
1653             bld.SHR(result,
1654                     subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
1655                     brw_imm_uw(8));
1656          } else {
1657             /* Otherwise use an AND with 0xff and a word type */
1658             bld.AND(result,
1659                     subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
1660                     brw_imm_uw(0xff));
1661          }
1662       } else {
1663          const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1664          bld.MOV(result, subscript(op[0], type, byte));
1665       }
1666       break;
1667    }
1668
1669    case nir_op_extract_u16:
1670    case nir_op_extract_i16: {
1671       const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
1672       unsigned word = nir_src_as_uint(instr->src[1].src);
1673       bld.MOV(result, subscript(op[0], type, word));
1674       break;
1675    }
1676
1677    default:
1678       unreachable("unhandled instruction");
1679    }
1680
1681    /* If we need to do a boolean resolve, replace the result with -(x & 1)
1682     * to sign extend the low bit to 0/~0
1683     */
1684    if (devinfo->gen <= 5 &&
1685        (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
1686       fs_reg masked = vgrf(glsl_type::int_type);
1687       bld.AND(masked, result, brw_imm_d(1));
1688       masked.negate = true;
1689       bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
1690    }
1691 }
1692
1693 void
1694 fs_visitor::nir_emit_load_const(const fs_builder &bld,
1695                                 nir_load_const_instr *instr)
1696 {
1697    const brw_reg_type reg_type =
1698       brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
1699    fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1700
1701    switch (instr->def.bit_size) {
1702    case 8:
1703       for (unsigned i = 0; i < instr->def.num_components; i++)
1704          bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8));
1705       break;
1706
1707    case 16:
1708       for (unsigned i = 0; i < instr->def.num_components; i++)
1709          bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16));
1710       break;
1711
1712    case 32:
1713       for (unsigned i = 0; i < instr->def.num_components; i++)
1714          bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32));
1715       break;
1716
1717    case 64:
1718       assert(devinfo->gen >= 7);
1719       if (devinfo->gen == 7) {
1720          /* We don't get 64-bit integer types until gen8 */
1721          for (unsigned i = 0; i < instr->def.num_components; i++) {
1722             bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
1723                     setup_imm_df(bld, instr->value[i].f64));
1724          }
1725       } else {
1726          for (unsigned i = 0; i < instr->def.num_components; i++)
1727             bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64));
1728       }
1729       break;
1730
1731    default:
1732       unreachable("Invalid bit size");
1733    }
1734
1735    nir_ssa_values[instr->def.index] = reg;
1736 }
1737
1738 fs_reg
1739 fs_visitor::get_nir_src(const nir_src &src)
1740 {
1741    fs_reg reg;
1742    if (src.is_ssa) {
1743       if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
1744          const brw_reg_type reg_type =
1745             brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D);
1746          reg = bld.vgrf(reg_type, src.ssa->num_components);
1747       } else {
1748          reg = nir_ssa_values[src.ssa->index];
1749       }
1750    } else {
1751       /* We don't handle indirects on locals */
1752       assert(src.reg.indirect == NULL);
1753       reg = offset(nir_locals[src.reg.reg->index], bld,
1754                    src.reg.base_offset * src.reg.reg->num_components);
1755    }
1756
1757    if (nir_src_bit_size(src) == 64 && devinfo->gen == 7) {
1758       /* The only 64-bit type available on gen7 is DF, so use that. */
1759       reg.type = BRW_REGISTER_TYPE_DF;
1760    } else {
1761       /* To avoid floating-point denorm flushing problems, set the type by
1762        * default to an integer type - instructions that need floating point
1763        * semantics will set this to F if they need to
1764        */
1765       reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),
1766                                             BRW_REGISTER_TYPE_D);
1767    }
1768
1769    return reg;
1770 }
1771
1772 /**
1773  * Return an IMM for constants; otherwise call get_nir_src() as normal.
1774  *
1775  * This function should not be called on any value which may be 64 bits.
1776  * We could theoretically support 64-bit on gen8+ but we choose not to
1777  * because it wouldn't work in general (no gen7 support) and there are
1778  * enough restrictions in 64-bit immediates that you can't take the return
1779  * value and treat it the same as the result of get_nir_src().
1780  */
1781 fs_reg
1782 fs_visitor::get_nir_src_imm(const nir_src &src)
1783 {
1784    assert(nir_src_bit_size(src) == 32);
1785    return nir_src_is_const(src) ?
1786           fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(src);
1787 }
1788
1789 fs_reg
1790 fs_visitor::get_nir_dest(const nir_dest &dest)
1791 {
1792    if (dest.is_ssa) {
1793       const brw_reg_type reg_type =
1794          brw_reg_type_from_bit_size(dest.ssa.bit_size,
1795                                     dest.ssa.bit_size == 8 ?
1796                                     BRW_REGISTER_TYPE_D :
1797                                     BRW_REGISTER_TYPE_F);
1798       nir_ssa_values[dest.ssa.index] =
1799          bld.vgrf(reg_type, dest.ssa.num_components);
1800       return nir_ssa_values[dest.ssa.index];
1801    } else {
1802       /* We don't handle indirects on locals */
1803       assert(dest.reg.indirect == NULL);
1804       return offset(nir_locals[dest.reg.reg->index], bld,
1805                     dest.reg.base_offset * dest.reg.reg->num_components);
1806    }
1807 }
1808
1809 void
1810 fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
1811                          unsigned wr_mask)
1812 {
1813    for (unsigned i = 0; i < 4; i++) {
1814       if (!((wr_mask >> i) & 1))
1815          continue;
1816
1817       fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
1818       new_inst->dst = offset(new_inst->dst, bld, i);
1819       for (unsigned j = 0; j < new_inst->sources; j++)
1820          if (new_inst->src[j].file == VGRF)
1821             new_inst->src[j] = offset(new_inst->src[j], bld, i);
1822
1823       bld.emit(new_inst);
1824    }
1825 }
1826
1827 static fs_inst *
1828 emit_pixel_interpolater_send(const fs_builder &bld,
1829                              enum opcode opcode,
1830                              const fs_reg &dst,
1831                              const fs_reg &src,
1832                              const fs_reg &desc,
1833                              glsl_interp_mode interpolation)
1834 {
1835    struct brw_wm_prog_data *wm_prog_data =
1836       brw_wm_prog_data(bld.shader->stage_prog_data);
1837
1838    fs_inst *inst = bld.emit(opcode, dst, src, desc);
1839    /* 2 floats per slot returned */
1840    inst->size_written = 2 * dst.component_size(inst->exec_size);
1841    inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
1842
1843    wm_prog_data->pulls_bary = true;
1844
1845    return inst;
1846 }
1847
1848 /**
1849  * Computes 1 << x, given a D/UD register containing some value x.
1850  */
1851 static fs_reg
1852 intexp2(const fs_builder &bld, const fs_reg &x)
1853 {
1854    assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
1855
1856    fs_reg result = bld.vgrf(x.type, 1);
1857    fs_reg one = bld.vgrf(x.type, 1);
1858
1859    bld.MOV(one, retype(brw_imm_d(1), one.type));
1860    bld.SHL(result, one, x);
1861    return result;
1862 }
1863
1864 void
1865 fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
1866 {
1867    assert(stage == MESA_SHADER_GEOMETRY);
1868
1869    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1870
1871    if (gs_compile->control_data_header_size_bits == 0)
1872       return;
1873
1874    /* We can only do EndPrimitive() functionality when the control data
1875     * consists of cut bits.  Fortunately, the only time it isn't is when the
1876     * output type is points, in which case EndPrimitive() is a no-op.
1877     */
1878    if (gs_prog_data->control_data_format !=
1879        GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
1880       return;
1881    }
1882
1883    /* Cut bits use one bit per vertex. */
1884    assert(gs_compile->control_data_bits_per_vertex == 1);
1885
1886    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1887    vertex_count.type = BRW_REGISTER_TYPE_UD;
1888
1889    /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
1890     * vertex n, 0 otherwise.  So all we need to do here is mark bit
1891     * (vertex_count - 1) % 32 in the cut_bits register to indicate that
1892     * EndPrimitive() was called after emitting vertex (vertex_count - 1);
1893     * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
1894     *
1895     * Note that if EndPrimitive() is called before emitting any vertices, this
1896     * will cause us to set bit 31 of the control_data_bits register to 1.
1897     * That's fine because:
1898     *
1899     * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
1900     *   output, so the hardware will ignore cut bit 31.
1901     *
1902     * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
1903     *   last vertex, so setting cut bit 31 has no effect (since the primitive
1904     *   is automatically ended when the GS terminates).
1905     *
1906     * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
1907     *   control_data_bits register to 0 when the first vertex is emitted.
1908     */
1909
1910    const fs_builder abld = bld.annotate("end primitive");
1911
1912    /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
1913    fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1914    abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1915    fs_reg mask = intexp2(abld, prev_count);
1916    /* Note: we're relying on the fact that the GEN SHL instruction only pays
1917     * attention to the lower 5 bits of its second source argument, so on this
1918     * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
1919     * ((vertex_count - 1) % 32).
1920     */
1921    abld.OR(this->control_data_bits, this->control_data_bits, mask);
1922 }
1923
1924 void
1925 fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
1926 {
1927    assert(stage == MESA_SHADER_GEOMETRY);
1928    assert(gs_compile->control_data_bits_per_vertex != 0);
1929
1930    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1931
1932    const fs_builder abld = bld.annotate("emit control data bits");
1933    const fs_builder fwa_bld = bld.exec_all();
1934
1935    /* We use a single UD register to accumulate control data bits (32 bits
1936     * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
1937     * at a time.
1938     *
1939     * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
1940     * We have select a 128-bit group via the Global and Per-Slot Offsets, then
1941     * use the Channel Mask phase to enable/disable which DWord within that
1942     * group to write.  (Remember, different SIMD8 channels may have emitted
1943     * different numbers of vertices, so we may need per-slot offsets.)
1944     *
1945     * Channel masking presents an annoying problem: we may have to replicate
1946     * the data up to 4 times:
1947     *
1948     * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
1949     *
1950     * To avoid penalizing shaders that emit a small number of vertices, we
1951     * can avoid these sometimes: if the size of the control data header is
1952     * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
1953     * land in the same 128-bit group, so we can skip per-slot offsets.
1954     *
1955     * Similarly, if the control data header is <= 32 bits, there is only one
1956     * DWord, so we can skip channel masks.
1957     */
1958    enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
1959
1960    fs_reg channel_mask, per_slot_offset;
1961
1962    if (gs_compile->control_data_header_size_bits > 32) {
1963       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
1964       channel_mask = vgrf(glsl_type::uint_type);
1965    }
1966
1967    if (gs_compile->control_data_header_size_bits > 128) {
1968       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
1969       per_slot_offset = vgrf(glsl_type::uint_type);
1970    }
1971
1972    /* Figure out which DWord we're trying to write to using the formula:
1973     *
1974     *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
1975     *
1976     * Since bits_per_vertex is a power of two, and is known at compile
1977     * time, this can be optimized to:
1978     *
1979     *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
1980     */
1981    if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
1982       fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1983       fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1984       abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1985       unsigned log2_bits_per_vertex =
1986          util_last_bit(gs_compile->control_data_bits_per_vertex);
1987       abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
1988
1989       if (per_slot_offset.file != BAD_FILE) {
1990          /* Set the per-slot offset to dword_index / 4, so that we'll write to
1991           * the appropriate OWord within the control data header.
1992           */
1993          abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
1994       }
1995
1996       /* Set the channel masks to 1 << (dword_index % 4), so that we'll
1997        * write to the appropriate DWORD within the OWORD.
1998        */
1999       fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2000       fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
2001       channel_mask = intexp2(fwa_bld, channel);
2002       /* Then the channel masks need to be in bits 23:16. */
2003       fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
2004    }
2005
2006    /* Store the control data bits in the message payload and send it. */
2007    unsigned mlen = 2;
2008    if (channel_mask.file != BAD_FILE)
2009       mlen += 4; /* channel masks, plus 3 extra copies of the data */
2010    if (per_slot_offset.file != BAD_FILE)
2011       mlen++;
2012
2013    fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
2014    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
2015    unsigned i = 0;
2016    sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
2017    if (per_slot_offset.file != BAD_FILE)
2018       sources[i++] = per_slot_offset;
2019    if (channel_mask.file != BAD_FILE)
2020       sources[i++] = channel_mask;
2021    while (i < mlen) {
2022       sources[i++] = this->control_data_bits;
2023    }
2024
2025    abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
2026    fs_inst *inst = abld.emit(opcode, reg_undef, payload);
2027    inst->mlen = mlen;
2028    /* We need to increment Global Offset by 256-bits to make room for
2029     * Broadwell's extra "Vertex Count" payload at the beginning of the
2030     * URB entry.  Since this is an OWord message, Global Offset is counted
2031     * in 128-bit units, so we must set it to 2.
2032     */
2033    if (gs_prog_data->static_vertex_count == -1)
2034       inst->offset = 2;
2035 }
2036
2037 void
2038 fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
2039                                             unsigned stream_id)
2040 {
2041    /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
2042
2043    /* Note: we are calling this *before* increasing vertex_count, so
2044     * this->vertex_count == vertex_count - 1 in the formula above.
2045     */
2046
2047    /* Stream mode uses 2 bits per vertex */
2048    assert(gs_compile->control_data_bits_per_vertex == 2);
2049
2050    /* Must be a valid stream */
2051    assert(stream_id < MAX_VERTEX_STREAMS);
2052
2053    /* Control data bits are initialized to 0 so we don't have to set any
2054     * bits when sending vertices to stream 0.
2055     */
2056    if (stream_id == 0)
2057       return;
2058
2059    const fs_builder abld = bld.annotate("set stream control data bits", NULL);
2060
2061    /* reg::sid = stream_id */
2062    fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2063    abld.MOV(sid, brw_imm_ud(stream_id));
2064
2065    /* reg:shift_count = 2 * (vertex_count - 1) */
2066    fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2067    abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
2068
2069    /* Note: we're relying on the fact that the GEN SHL instruction only pays
2070     * attention to the lower 5 bits of its second source argument, so on this
2071     * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
2072     * stream_id << ((2 * (vertex_count - 1)) % 32).
2073     */
2074    fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2075    abld.SHL(mask, sid, shift_count);
2076    abld.OR(this->control_data_bits, this->control_data_bits, mask);
2077 }
2078
2079 void
2080 fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
2081                            unsigned stream_id)
2082 {
2083    assert(stage == MESA_SHADER_GEOMETRY);
2084
2085    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2086
2087    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
2088    vertex_count.type = BRW_REGISTER_TYPE_UD;
2089
2090    /* Haswell and later hardware ignores the "Render Stream Select" bits
2091     * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
2092     * and instead sends all primitives down the pipeline for rasterization.
2093     * If the SOL stage is enabled, "Render Stream Select" is honored and
2094     * primitives bound to non-zero streams are discarded after stream output.
2095     *
2096     * Since the only purpose of primives sent to non-zero streams is to
2097     * be recorded by transform feedback, we can simply discard all geometry
2098     * bound to these streams when transform feedback is disabled.
2099     */
2100    if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
2101       return;
2102
2103    /* If we're outputting 32 control data bits or less, then we can wait
2104     * until the shader is over to output them all.  Otherwise we need to
2105     * output them as we go.  Now is the time to do it, since we're about to
2106     * output the vertex_count'th vertex, so it's guaranteed that the
2107     * control data bits associated with the (vertex_count - 1)th vertex are
2108     * correct.
2109     */
2110    if (gs_compile->control_data_header_size_bits > 32) {
2111       const fs_builder abld =
2112          bld.annotate("emit vertex: emit control data bits");
2113
2114       /* Only emit control data bits if we've finished accumulating a batch
2115        * of 32 bits.  This is the case when:
2116        *
2117        *     (vertex_count * bits_per_vertex) % 32 == 0
2118        *
2119        * (in other words, when the last 5 bits of vertex_count *
2120        * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
2121        * integer n (which is always the case, since bits_per_vertex is
2122        * always 1 or 2), this is equivalent to requiring that the last 5-n
2123        * bits of vertex_count are 0:
2124        *
2125        *     vertex_count & (2^(5-n) - 1) == 0
2126        *
2127        * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2128        * equivalent to:
2129        *
2130        *     vertex_count & (32 / bits_per_vertex - 1) == 0
2131        *
2132        * TODO: If vertex_count is an immediate, we could do some of this math
2133        *       at compile time...
2134        */
2135       fs_inst *inst =
2136          abld.AND(bld.null_reg_d(), vertex_count,
2137                   brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
2138       inst->conditional_mod = BRW_CONDITIONAL_Z;
2139
2140       abld.IF(BRW_PREDICATE_NORMAL);
2141       /* If vertex_count is 0, then no control data bits have been
2142        * accumulated yet, so we can skip emitting them.
2143        */
2144       abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
2145                BRW_CONDITIONAL_NEQ);
2146       abld.IF(BRW_PREDICATE_NORMAL);
2147       emit_gs_control_data_bits(vertex_count);
2148       abld.emit(BRW_OPCODE_ENDIF);
2149
2150       /* Reset control_data_bits to 0 so we can start accumulating a new
2151        * batch.
2152        *
2153        * Note: in the case where vertex_count == 0, this neutralizes the
2154        * effect of any call to EndPrimitive() that the shader may have
2155        * made before outputting its first vertex.
2156        */
2157       inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
2158       inst->force_writemask_all = true;
2159       abld.emit(BRW_OPCODE_ENDIF);
2160    }
2161
2162    emit_urb_writes(vertex_count);
2163
2164    /* In stream mode we have to set control data bits for all vertices
2165     * unless we have disabled control data bits completely (which we do
2166     * do for GL_POINTS outputs that don't use streams).
2167     */
2168    if (gs_compile->control_data_header_size_bits > 0 &&
2169        gs_prog_data->control_data_format ==
2170           GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2171       set_gs_stream_control_data_bits(vertex_count, stream_id);
2172    }
2173 }
2174
2175 void
2176 fs_visitor::emit_gs_input_load(const fs_reg &dst,
2177                                const nir_src &vertex_src,
2178                                unsigned base_offset,
2179                                const nir_src &offset_src,
2180                                unsigned num_components,
2181                                unsigned first_component)
2182 {
2183    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2184    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2185
2186    /* TODO: figure out push input layout for invocations == 1 */
2187    /* TODO: make this work with 64-bit inputs */
2188    if (gs_prog_data->invocations == 1 &&
2189        type_sz(dst.type) <= 4 &&
2190        nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
2191        4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
2192       int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
2193                        nir_src_as_uint(vertex_src) * push_reg_count;
2194       for (unsigned i = 0; i < num_components; i++) {
2195          bld.MOV(offset(dst, bld, i),
2196                  fs_reg(ATTR, imm_offset + i + first_component, dst.type));
2197       }
2198       return;
2199    }
2200
2201    /* Resort to the pull model.  Ensure the VUE handles are provided. */
2202    assert(gs_prog_data->base.include_vue_handles);
2203
2204    unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
2205    fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2206
2207    if (gs_prog_data->invocations == 1) {
2208       if (nir_src_is_const(vertex_src)) {
2209          /* The vertex index is constant; just select the proper URB handle. */
2210          icp_handle =
2211             retype(brw_vec8_grf(first_icp_handle + nir_src_as_uint(vertex_src), 0),
2212                    BRW_REGISTER_TYPE_UD);
2213       } else {
2214          /* The vertex index is non-constant.  We need to use indirect
2215           * addressing to fetch the proper URB handle.
2216           *
2217           * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2218           * indicating that channel <n> should read the handle from
2219           * DWord <n>.  We convert that to bytes by multiplying by 4.
2220           *
2221           * Next, we convert the vertex index to bytes by multiplying
2222           * by 32 (shifting by 5), and add the two together.  This is
2223           * the final indirect byte offset.
2224           */
2225          fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
2226          fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2227          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2228          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2229
2230          /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
2231          bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
2232          /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2233          bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2234          /* Convert vertex_index to bytes (multiply by 32) */
2235          bld.SHL(vertex_offset_bytes,
2236                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2237                  brw_imm_ud(5u));
2238          bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2239
2240          /* Use first_icp_handle as the base offset.  There is one register
2241           * of URB handles per vertex, so inform the register allocator that
2242           * we might read up to nir->info.gs.vertices_in registers.
2243           */
2244          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2245                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2246                   fs_reg(icp_offset_bytes),
2247                   brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));
2248       }
2249    } else {
2250       assert(gs_prog_data->invocations > 1);
2251
2252       if (nir_src_is_const(vertex_src)) {
2253          unsigned vertex = nir_src_as_uint(vertex_src);
2254          assert(devinfo->gen >= 9 || vertex <= 5);
2255          bld.MOV(icp_handle,
2256                  retype(brw_vec1_grf(first_icp_handle + vertex / 8, vertex % 8),
2257                         BRW_REGISTER_TYPE_UD));
2258       } else {
2259          /* The vertex index is non-constant.  We need to use indirect
2260           * addressing to fetch the proper URB handle.
2261           *
2262           */
2263          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2264
2265          /* Convert vertex_index to bytes (multiply by 4) */
2266          bld.SHL(icp_offset_bytes,
2267                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2268                  brw_imm_ud(2u));
2269
2270          /* Use first_icp_handle as the base offset.  There is one DWord
2271           * of URB handles per vertex, so inform the register allocator that
2272           * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2273           */
2274          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2275                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2276                   fs_reg(icp_offset_bytes),
2277                   brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) *
2278                              REG_SIZE));
2279       }
2280    }
2281
2282    fs_inst *inst;
2283
2284    fs_reg tmp_dst = dst;
2285    fs_reg indirect_offset = get_nir_src(offset_src);
2286    unsigned num_iterations = 1;
2287    unsigned orig_num_components = num_components;
2288
2289    if (type_sz(dst.type) == 8) {
2290       if (num_components > 2) {
2291          num_iterations = 2;
2292          num_components = 2;
2293       }
2294       fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2295       tmp_dst = tmp;
2296       first_component = first_component / 2;
2297    }
2298
2299    for (unsigned iter = 0; iter < num_iterations; iter++) {
2300       if (nir_src_is_const(offset_src)) {
2301          /* Constant indexing - use global offset. */
2302          if (first_component != 0) {
2303             unsigned read_components = num_components + first_component;
2304             fs_reg tmp = bld.vgrf(dst.type, read_components);
2305             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2306             inst->size_written = read_components *
2307                                  tmp.component_size(inst->exec_size);
2308             for (unsigned i = 0; i < num_components; i++) {
2309                bld.MOV(offset(tmp_dst, bld, i),
2310                        offset(tmp, bld, i + first_component));
2311             }
2312          } else {
2313             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst,
2314                             icp_handle);
2315             inst->size_written = num_components *
2316                                  tmp_dst.component_size(inst->exec_size);
2317          }
2318          inst->offset = base_offset + nir_src_as_uint(offset_src);
2319          inst->mlen = 1;
2320       } else {
2321          /* Indirect indexing - use per-slot offsets as well. */
2322          const fs_reg srcs[] = { icp_handle, indirect_offset };
2323          unsigned read_components = num_components + first_component;
2324          fs_reg tmp = bld.vgrf(dst.type, read_components);
2325          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2326          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2327          if (first_component != 0) {
2328             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2329                             payload);
2330             inst->size_written = read_components *
2331                                  tmp.component_size(inst->exec_size);
2332             for (unsigned i = 0; i < num_components; i++) {
2333                bld.MOV(offset(tmp_dst, bld, i),
2334                        offset(tmp, bld, i + first_component));
2335             }
2336          } else {
2337             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst,
2338                          payload);
2339             inst->size_written = num_components *
2340                                  tmp_dst.component_size(inst->exec_size);
2341          }
2342          inst->offset = base_offset;
2343          inst->mlen = 2;
2344       }
2345
2346       if (type_sz(dst.type) == 8) {
2347          shuffle_from_32bit_read(bld,
2348                                  offset(dst, bld, iter * 2),
2349                                  retype(tmp_dst, BRW_REGISTER_TYPE_D),
2350                                  0,
2351                                  num_components);
2352       }
2353
2354       if (num_iterations > 1) {
2355          num_components = orig_num_components - 2;
2356          if(nir_src_is_const(offset_src)) {
2357             base_offset++;
2358          } else {
2359             fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2360             bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u));
2361             indirect_offset = new_indirect;
2362          }
2363       }
2364    }
2365 }
2366
2367 fs_reg
2368 fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
2369 {
2370    nir_src *offset_src = nir_get_io_offset_src(instr);
2371
2372    if (nir_src_is_const(*offset_src)) {
2373       /* The only constant offset we should find is 0.  brw_nir.c's
2374        * add_const_offset_to_base() will fold other constant offsets
2375        * into instr->const_index[0].
2376        */
2377       assert(nir_src_as_uint(*offset_src) == 0);
2378       return fs_reg();
2379    }
2380
2381    return get_nir_src(*offset_src);
2382 }
2383
2384 void
2385 fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
2386                                   nir_intrinsic_instr *instr)
2387 {
2388    assert(stage == MESA_SHADER_VERTEX);
2389
2390    fs_reg dest;
2391    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2392       dest = get_nir_dest(instr->dest);
2393
2394    switch (instr->intrinsic) {
2395    case nir_intrinsic_load_vertex_id:
2396    case nir_intrinsic_load_base_vertex:
2397       unreachable("should be lowered by nir_lower_system_values()");
2398
2399    case nir_intrinsic_load_input: {
2400       fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type);
2401       unsigned first_component = nir_intrinsic_component(instr);
2402       unsigned num_components = instr->num_components;
2403
2404       src = offset(src, bld, nir_src_as_uint(instr->src[0]));
2405
2406       if (type_sz(dest.type) == 8)
2407          first_component /= 2;
2408
2409       /* For 16-bit support maybe a temporary will be needed to copy from
2410        * the ATTR file.
2411        */
2412       shuffle_from_32bit_read(bld, dest, retype(src, BRW_REGISTER_TYPE_D),
2413                               first_component, num_components);
2414       break;
2415    }
2416
2417    case nir_intrinsic_load_vertex_id_zero_base:
2418    case nir_intrinsic_load_instance_id:
2419    case nir_intrinsic_load_base_instance:
2420    case nir_intrinsic_load_draw_id:
2421    case nir_intrinsic_load_first_vertex:
2422    case nir_intrinsic_load_is_indexed_draw:
2423       unreachable("lowered by brw_nir_lower_vs_inputs");
2424
2425    default:
2426       nir_emit_intrinsic(bld, instr);
2427       break;
2428    }
2429 }
2430
2431 void
2432 fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
2433                                    nir_intrinsic_instr *instr)
2434 {
2435    assert(stage == MESA_SHADER_TESS_CTRL);
2436    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2437    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2438
2439    fs_reg dst;
2440    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2441       dst = get_nir_dest(instr->dest);
2442
2443    switch (instr->intrinsic) {
2444    case nir_intrinsic_load_primitive_id:
2445       bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
2446       break;
2447    case nir_intrinsic_load_invocation_id:
2448       bld.MOV(retype(dst, invocation_id.type), invocation_id);
2449       break;
2450    case nir_intrinsic_load_patch_vertices_in:
2451       bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
2452               brw_imm_d(tcs_key->input_vertices));
2453       break;
2454
2455    case nir_intrinsic_barrier: {
2456       if (tcs_prog_data->instances == 1)
2457          break;
2458
2459       fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2460       fs_reg m0_2 = component(m0, 2);
2461
2462       const fs_builder chanbld = bld.exec_all().group(1, 0);
2463
2464       /* Zero the message header */
2465       bld.exec_all().MOV(m0, brw_imm_ud(0u));
2466
2467       if (devinfo->gen < 11) {
2468          /* Copy "Barrier ID" from r0.2, bits 16:13 */
2469          chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2470                      brw_imm_ud(INTEL_MASK(16, 13)));
2471
2472          /* Shift it up to bits 27:24. */
2473          chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
2474       } else {
2475          chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2476                      brw_imm_ud(INTEL_MASK(30, 24)));
2477       }
2478
2479       /* Set the Barrier Count and the enable bit */
2480       if (devinfo->gen < 11) {
2481          chanbld.OR(m0_2, m0_2,
2482                     brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
2483       } else {
2484          chanbld.OR(m0_2, m0_2,
2485                     brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
2486       }
2487
2488       bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2489       break;
2490    }
2491
2492    case nir_intrinsic_load_input:
2493       unreachable("nir_lower_io should never give us these.");
2494       break;
2495
2496    case nir_intrinsic_load_per_vertex_input: {
2497       fs_reg indirect_offset = get_indirect_offset(instr);
2498       unsigned imm_offset = instr->const_index[0];
2499
2500       const nir_src &vertex_src = instr->src[0];
2501
2502       fs_inst *inst;
2503
2504       fs_reg icp_handle;
2505
2506       if (nir_src_is_const(vertex_src)) {
2507          /* Emit a MOV to resolve <0,1,0> regioning. */
2508          icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2509          unsigned vertex = nir_src_as_uint(vertex_src);
2510          bld.MOV(icp_handle,
2511                  retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7),
2512                         BRW_REGISTER_TYPE_UD));
2513       } else if (tcs_prog_data->instances == 1 &&
2514                  vertex_src.is_ssa &&
2515                  vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
2516                  nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) {
2517          /* For the common case of only 1 instance, an array index of
2518           * gl_InvocationID means reading g1.  Skip all the indirect work.
2519           */
2520          icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
2521       } else {
2522          /* The vertex index is non-constant.  We need to use indirect
2523           * addressing to fetch the proper URB handle.
2524           */
2525          icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2526
2527          /* Each ICP handle is a single DWord (4 bytes) */
2528          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2529          bld.SHL(vertex_offset_bytes,
2530                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2531                  brw_imm_ud(2u));
2532
2533          /* Start at g1.  We might read up to 4 registers. */
2534          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2535                   retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
2536                   brw_imm_ud(4 * REG_SIZE));
2537       }
2538
2539       /* We can only read two double components with each URB read, so
2540        * we send two read messages in that case, each one loading up to
2541        * two double components.
2542        */
2543       unsigned num_iterations = 1;
2544       unsigned num_components = instr->num_components;
2545       unsigned first_component = nir_intrinsic_component(instr);
2546       fs_reg orig_dst = dst;
2547       if (type_sz(dst.type) == 8) {
2548          first_component = first_component / 2;
2549          if (instr->num_components > 2) {
2550             num_iterations = 2;
2551             num_components = 2;
2552          }
2553
2554          fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2555          dst = tmp;
2556       }
2557
2558       for (unsigned iter = 0; iter < num_iterations; iter++) {
2559          if (indirect_offset.file == BAD_FILE) {
2560             /* Constant indexing - use global offset. */
2561             if (first_component != 0) {
2562                unsigned read_components = num_components + first_component;
2563                fs_reg tmp = bld.vgrf(dst.type, read_components);
2564                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2565                for (unsigned i = 0; i < num_components; i++) {
2566                   bld.MOV(offset(dst, bld, i),
2567                           offset(tmp, bld, i + first_component));
2568                }
2569             } else {
2570                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
2571             }
2572             inst->offset = imm_offset;
2573             inst->mlen = 1;
2574          } else {
2575             /* Indirect indexing - use per-slot offsets as well. */
2576             const fs_reg srcs[] = { icp_handle, indirect_offset };
2577             fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2578             bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2579             if (first_component != 0) {
2580                unsigned read_components = num_components + first_component;
2581                fs_reg tmp = bld.vgrf(dst.type, read_components);
2582                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2583                                payload);
2584                for (unsigned i = 0; i < num_components; i++) {
2585                   bld.MOV(offset(dst, bld, i),
2586                           offset(tmp, bld, i + first_component));
2587                }
2588             } else {
2589                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2590                                payload);
2591             }
2592             inst->offset = imm_offset;
2593             inst->mlen = 2;
2594          }
2595          inst->size_written = (num_components + first_component) *
2596                               inst->dst.component_size(inst->exec_size);
2597
2598          /* If we are reading 64-bit data using 32-bit read messages we need
2599           * build proper 64-bit data elements by shuffling the low and high
2600           * 32-bit components around like we do for other things like UBOs
2601           * or SSBOs.
2602           */
2603          if (type_sz(dst.type) == 8) {
2604             shuffle_from_32bit_read(bld,
2605                                     offset(orig_dst, bld, iter * 2),
2606                                     retype(dst, BRW_REGISTER_TYPE_D),
2607                                     0, num_components);
2608          }
2609
2610          /* Copy the temporary to the destination to deal with writemasking.
2611           *
2612           * Also attempt to deal with gl_PointSize being in the .w component.
2613           */
2614          if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2615             assert(type_sz(dst.type) < 8);
2616             inst->dst = bld.vgrf(dst.type, 4);
2617             inst->size_written = 4 * REG_SIZE;
2618             bld.MOV(dst, offset(inst->dst, bld, 3));
2619          }
2620
2621          /* If we are loading double data and we need a second read message
2622           * adjust the write offset
2623           */
2624          if (num_iterations > 1) {
2625             num_components = instr->num_components - 2;
2626             imm_offset++;
2627          }
2628       }
2629       break;
2630    }
2631
2632    case nir_intrinsic_load_output:
2633    case nir_intrinsic_load_per_vertex_output: {
2634       fs_reg indirect_offset = get_indirect_offset(instr);
2635       unsigned imm_offset = instr->const_index[0];
2636       unsigned first_component = nir_intrinsic_component(instr);
2637
2638       fs_inst *inst;
2639       if (indirect_offset.file == BAD_FILE) {
2640          /* Replicate the patch handle to all enabled channels */
2641          fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2642          bld.MOV(patch_handle,
2643                  retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
2644
2645          {
2646             if (first_component != 0) {
2647                unsigned read_components =
2648                   instr->num_components + first_component;
2649                fs_reg tmp = bld.vgrf(dst.type, read_components);
2650                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2651                                patch_handle);
2652                inst->size_written = read_components * REG_SIZE;
2653                for (unsigned i = 0; i < instr->num_components; i++) {
2654                   bld.MOV(offset(dst, bld, i),
2655                           offset(tmp, bld, i + first_component));
2656                }
2657             } else {
2658                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
2659                                patch_handle);
2660                inst->size_written = instr->num_components * REG_SIZE;
2661             }
2662             inst->offset = imm_offset;
2663             inst->mlen = 1;
2664          }
2665       } else {
2666          /* Indirect indexing - use per-slot offsets as well. */
2667          const fs_reg srcs[] = {
2668             retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2669             indirect_offset
2670          };
2671          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2672          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2673          if (first_component != 0) {
2674             unsigned read_components =
2675                instr->num_components + first_component;
2676             fs_reg tmp = bld.vgrf(dst.type, read_components);
2677             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2678                             payload);
2679             inst->size_written = read_components * REG_SIZE;
2680             for (unsigned i = 0; i < instr->num_components; i++) {
2681                bld.MOV(offset(dst, bld, i),
2682                        offset(tmp, bld, i + first_component));
2683             }
2684          } else {
2685             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2686                             payload);
2687             inst->size_written = instr->num_components * REG_SIZE;
2688          }
2689          inst->offset = imm_offset;
2690          inst->mlen = 2;
2691       }
2692       break;
2693    }
2694
2695    case nir_intrinsic_store_output:
2696    case nir_intrinsic_store_per_vertex_output: {
2697       fs_reg value = get_nir_src(instr->src[0]);
2698       bool is_64bit = (instr->src[0].is_ssa ?
2699          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
2700       fs_reg indirect_offset = get_indirect_offset(instr);
2701       unsigned imm_offset = instr->const_index[0];
2702       unsigned mask = instr->const_index[1];
2703       unsigned header_regs = 0;
2704       fs_reg srcs[7];
2705       srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
2706
2707       if (indirect_offset.file != BAD_FILE) {
2708          srcs[header_regs++] = indirect_offset;
2709       }
2710
2711       if (mask == 0)
2712          break;
2713
2714       unsigned num_components = util_last_bit(mask);
2715       enum opcode opcode;
2716
2717       /* We can only pack two 64-bit components in a single message, so send
2718        * 2 messages if we have more components
2719        */
2720       unsigned num_iterations = 1;
2721       unsigned iter_components = num_components;
2722       unsigned first_component = nir_intrinsic_component(instr);
2723       if (is_64bit) {
2724          first_component = first_component / 2;
2725          if (instr->num_components > 2) {
2726             num_iterations = 2;
2727             iter_components = 2;
2728          }
2729       }
2730
2731       mask = mask << first_component;
2732
2733       for (unsigned iter = 0; iter < num_iterations; iter++) {
2734          if (!is_64bit && mask != WRITEMASK_XYZW) {
2735             srcs[header_regs++] = brw_imm_ud(mask << 16);
2736             opcode = indirect_offset.file != BAD_FILE ?
2737                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2738                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2739          } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) {
2740             /* Expand the 64-bit mask to 32-bit channels. We only handle
2741              * two channels in each iteration, so we only care about X/Y.
2742              */
2743             unsigned mask32 = 0;
2744             if (mask & WRITEMASK_X)
2745                mask32 |= WRITEMASK_XY;
2746             if (mask & WRITEMASK_Y)
2747                mask32 |= WRITEMASK_ZW;
2748
2749             /* If the mask does not include any of the channels X or Y there
2750              * is nothing to do in this iteration. Move on to the next couple
2751              * of 64-bit channels.
2752              */
2753             if (!mask32) {
2754                mask >>= 2;
2755                imm_offset++;
2756                continue;
2757             }
2758
2759             srcs[header_regs++] = brw_imm_ud(mask32 << 16);
2760             opcode = indirect_offset.file != BAD_FILE ?
2761                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2762                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2763          } else {
2764             opcode = indirect_offset.file != BAD_FILE ?
2765                SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
2766                SHADER_OPCODE_URB_WRITE_SIMD8;
2767          }
2768
2769          for (unsigned i = 0; i < iter_components; i++) {
2770             if (!(mask & (1 << (i + first_component))))
2771                continue;
2772
2773             if (!is_64bit) {
2774                srcs[header_regs + i + first_component] = offset(value, bld, i);
2775             } else {
2776                /* We need to shuffle the 64-bit data to match the layout
2777                 * expected by our 32-bit URB write messages. We use a temporary
2778                 * for that.
2779                 */
2780                unsigned channel = iter * 2 + i;
2781                fs_reg dest = shuffle_for_32bit_write(bld, value, channel, 1);
2782
2783                srcs[header_regs + (i + first_component) * 2] = dest;
2784                srcs[header_regs + (i + first_component) * 2 + 1] =
2785                   offset(dest, bld, 1);
2786             }
2787          }
2788
2789          unsigned mlen =
2790             header_regs + (is_64bit ? 2 * iter_components : iter_components) +
2791             (is_64bit ? 2 * first_component : first_component);
2792          fs_reg payload =
2793             bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
2794          bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
2795
2796          fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
2797          inst->offset = imm_offset;
2798          inst->mlen = mlen;
2799
2800          /* If this is a 64-bit attribute, select the next two 64-bit channels
2801           * to be handled in the next iteration.
2802           */
2803          if (is_64bit) {
2804             mask >>= 2;
2805             imm_offset++;
2806          }
2807       }
2808       break;
2809    }
2810
2811    default:
2812       nir_emit_intrinsic(bld, instr);
2813       break;
2814    }
2815 }
2816
2817 void
2818 fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
2819                                    nir_intrinsic_instr *instr)
2820 {
2821    assert(stage == MESA_SHADER_TESS_EVAL);
2822    struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
2823
2824    fs_reg dest;
2825    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2826       dest = get_nir_dest(instr->dest);
2827
2828    switch (instr->intrinsic) {
2829    case nir_intrinsic_load_primitive_id:
2830       bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
2831       break;
2832    case nir_intrinsic_load_tess_coord:
2833       /* gl_TessCoord is part of the payload in g1-3 */
2834       for (unsigned i = 0; i < 3; i++) {
2835          bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
2836       }
2837       break;
2838
2839    case nir_intrinsic_load_input:
2840    case nir_intrinsic_load_per_vertex_input: {
2841       fs_reg indirect_offset = get_indirect_offset(instr);
2842       unsigned imm_offset = instr->const_index[0];
2843       unsigned first_component = nir_intrinsic_component(instr);
2844
2845       if (type_sz(dest.type) == 8) {
2846          first_component = first_component / 2;
2847       }
2848
2849       fs_inst *inst;
2850       if (indirect_offset.file == BAD_FILE) {
2851          /* Arbitrarily only push up to 32 vec4 slots worth of data,
2852           * which is 16 registers (since each holds 2 vec4 slots).
2853           */
2854          unsigned slot_count = 1;
2855          if (type_sz(dest.type) == 8 && instr->num_components > 2)
2856             slot_count++;
2857
2858          const unsigned max_push_slots = 32;
2859          if (imm_offset + slot_count <= max_push_slots) {
2860             fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
2861             for (int i = 0; i < instr->num_components; i++) {
2862                unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) +
2863                   i + first_component;
2864                bld.MOV(offset(dest, bld, i), component(src, comp));
2865             }
2866
2867             tes_prog_data->base.urb_read_length =
2868                MAX2(tes_prog_data->base.urb_read_length,
2869                     DIV_ROUND_UP(imm_offset + slot_count, 2));
2870          } else {
2871             /* Replicate the patch handle to all enabled channels */
2872             const fs_reg srcs[] = {
2873                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
2874             };
2875             fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2876             bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
2877
2878             if (first_component != 0) {
2879                unsigned read_components =
2880                   instr->num_components + first_component;
2881                fs_reg tmp = bld.vgrf(dest.type, read_components);
2882                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2883                                patch_handle);
2884                inst->size_written = read_components * REG_SIZE;
2885                for (unsigned i = 0; i < instr->num_components; i++) {
2886                   bld.MOV(offset(dest, bld, i),
2887                           offset(tmp, bld, i + first_component));
2888                }
2889             } else {
2890                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
2891                                patch_handle);
2892                inst->size_written = instr->num_components * REG_SIZE;
2893             }
2894             inst->mlen = 1;
2895             inst->offset = imm_offset;
2896          }
2897       } else {
2898          /* Indirect indexing - use per-slot offsets as well. */
2899
2900          /* We can only read two double components with each URB read, so
2901           * we send two read messages in that case, each one loading up to
2902           * two double components.
2903           */
2904          unsigned num_iterations = 1;
2905          unsigned num_components = instr->num_components;
2906          fs_reg orig_dest = dest;
2907          if (type_sz(dest.type) == 8) {
2908             if (instr->num_components > 2) {
2909                num_iterations = 2;
2910                num_components = 2;
2911             }
2912             fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type);
2913             dest = tmp;
2914          }
2915
2916          for (unsigned iter = 0; iter < num_iterations; iter++) {
2917             const fs_reg srcs[] = {
2918                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2919                indirect_offset
2920             };
2921             fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2922             bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2923
2924             if (first_component != 0) {
2925                unsigned read_components =
2926                    num_components + first_component;
2927                fs_reg tmp = bld.vgrf(dest.type, read_components);
2928                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2929                                payload);
2930                for (unsigned i = 0; i < num_components; i++) {
2931                   bld.MOV(offset(dest, bld, i),
2932                           offset(tmp, bld, i + first_component));
2933                }
2934             } else {
2935                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,
2936                                payload);
2937             }
2938             inst->mlen = 2;
2939             inst->offset = imm_offset;
2940             inst->size_written = (num_components + first_component) *
2941                                  inst->dst.component_size(inst->exec_size);
2942
2943             /* If we are reading 64-bit data using 32-bit read messages we need
2944              * build proper 64-bit data elements by shuffling the low and high
2945              * 32-bit components around like we do for other things like UBOs
2946              * or SSBOs.
2947              */
2948             if (type_sz(dest.type) == 8) {
2949                shuffle_from_32bit_read(bld,
2950                                        offset(orig_dest, bld, iter * 2),
2951                                        retype(dest, BRW_REGISTER_TYPE_D),
2952                                        0, num_components);
2953             }
2954
2955             /* If we are loading double data and we need a second read message
2956              * adjust the offset
2957              */
2958             if (num_iterations > 1) {
2959                num_components = instr->num_components - 2;
2960                imm_offset++;
2961             }
2962          }
2963       }
2964       break;
2965    }
2966    default:
2967       nir_emit_intrinsic(bld, instr);
2968       break;
2969    }
2970 }
2971
2972 void
2973 fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
2974                                   nir_intrinsic_instr *instr)
2975 {
2976    assert(stage == MESA_SHADER_GEOMETRY);
2977    fs_reg indirect_offset;
2978
2979    fs_reg dest;
2980    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2981       dest = get_nir_dest(instr->dest);
2982
2983    switch (instr->intrinsic) {
2984    case nir_intrinsic_load_primitive_id:
2985       assert(stage == MESA_SHADER_GEOMETRY);
2986       assert(brw_gs_prog_data(prog_data)->include_primitive_id);
2987       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
2988               retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
2989       break;
2990
2991    case nir_intrinsic_load_input:
2992       unreachable("load_input intrinsics are invalid for the GS stage");
2993
2994    case nir_intrinsic_load_per_vertex_input:
2995       emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
2996                          instr->src[1], instr->num_components,
2997                          nir_intrinsic_component(instr));
2998       break;
2999
3000    case nir_intrinsic_emit_vertex_with_counter:
3001       emit_gs_vertex(instr->src[0], instr->const_index[0]);
3002       break;
3003
3004    case nir_intrinsic_end_primitive_with_counter:
3005       emit_gs_end_primitive(instr->src[0]);
3006       break;
3007
3008    case nir_intrinsic_set_vertex_count:
3009       bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
3010       break;
3011
3012    case nir_intrinsic_load_invocation_id: {
3013       fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
3014       assert(val.file != BAD_FILE);
3015       dest.type = val.type;
3016       bld.MOV(dest, val);
3017       break;
3018    }
3019
3020    default:
3021       nir_emit_intrinsic(bld, instr);
3022       break;
3023    }
3024 }
3025
3026 /**
3027  * Fetch the current render target layer index.
3028  */
3029 static fs_reg
3030 fetch_render_target_array_index(const fs_builder &bld)
3031 {
3032    if (bld.shader->devinfo->gen >= 6) {
3033       /* The render target array index is provided in the thread payload as
3034        * bits 26:16 of r0.0.
3035        */
3036       const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
3037       bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
3038               brw_imm_uw(0x7ff));
3039       return idx;
3040    } else {
3041       /* Pre-SNB we only ever render into the first layer of the framebuffer
3042        * since layered rendering is not implemented.
3043        */
3044       return brw_imm_ud(0);
3045    }
3046 }
3047
3048 /**
3049  * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3050  * framebuffer at the current fragment coordinates and sample index.
3051  */
3052 fs_inst *
3053 fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
3054                                       unsigned target)
3055 {
3056    const struct gen_device_info *devinfo = bld.shader->devinfo;
3057
3058    assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3059    const brw_wm_prog_key *wm_key =
3060       reinterpret_cast<const brw_wm_prog_key *>(key);
3061    assert(!wm_key->coherent_fb_fetch);
3062    const struct brw_wm_prog_data *wm_prog_data =
3063       brw_wm_prog_data(stage_prog_data);
3064
3065    /* Calculate the surface index relative to the start of the texture binding
3066     * table block, since that's what the texturing messages expect.
3067     */
3068    const unsigned surface = target +
3069       wm_prog_data->binding_table.render_target_read_start -
3070       wm_prog_data->base.binding_table.texture_start;
3071
3072    /* Calculate the fragment coordinates. */
3073    const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
3074    bld.MOV(offset(coords, bld, 0), pixel_x);
3075    bld.MOV(offset(coords, bld, 1), pixel_y);
3076    bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3077
3078    /* Calculate the sample index and MCS payload when multisampling.  Luckily
3079     * the MCS fetch message behaves deterministically for UMS surfaces, so it
3080     * shouldn't be necessary to recompile based on whether the framebuffer is
3081     * CMS or UMS.
3082     */
3083    if (wm_key->multisample_fbo &&
3084        nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3085       nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
3086
3087    const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
3088    const fs_reg mcs = wm_key->multisample_fbo ?
3089       emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg();
3090
3091    /* Use either a normal or a CMS texel fetch message depending on whether
3092     * the framebuffer is single or multisample.  On SKL+ use the wide CMS
3093     * message just in case the framebuffer uses 16x multisampling, it should
3094     * be equivalent to the normal CMS fetch for lower multisampling modes.
3095     */
3096    const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL :
3097                      devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :
3098                      SHADER_OPCODE_TXF_CMS_LOGICAL;
3099
3100    /* Emit the instruction. */
3101    fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3102    srcs[TEX_LOGICAL_SRC_COORDINATE]       = coords;
3103    srcs[TEX_LOGICAL_SRC_LOD]              = brw_imm_ud(0);
3104    srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX]     = sample;
3105    srcs[TEX_LOGICAL_SRC_MCS]              = mcs;
3106    srcs[TEX_LOGICAL_SRC_SURFACE]          = brw_imm_ud(surface);
3107    srcs[TEX_LOGICAL_SRC_SAMPLER]          = brw_imm_ud(0);
3108    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3);
3109    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS]  = brw_imm_ud(0);
3110
3111    fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3112    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3113
3114    return inst;
3115 }
3116
3117 /**
3118  * Actual coherent framebuffer read implemented using the native render target
3119  * read message.  Requires SKL+.
3120  */
3121 static fs_inst *
3122 emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
3123 {
3124    assert(bld.shader->devinfo->gen >= 9);
3125    fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
3126    inst->target = target;
3127    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3128
3129    return inst;
3130 }
3131
3132 static fs_reg
3133 alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
3134 {
3135    if (n && regs[0].file != BAD_FILE) {
3136       return regs[0];
3137
3138    } else {
3139       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
3140
3141       for (unsigned i = 0; i < n; i++)
3142          regs[i] = tmp;
3143
3144       return tmp;
3145    }
3146 }
3147
3148 static fs_reg
3149 alloc_frag_output(fs_visitor *v, unsigned location)
3150 {
3151    assert(v->stage == MESA_SHADER_FRAGMENT);
3152    const brw_wm_prog_key *const key =
3153       reinterpret_cast<const brw_wm_prog_key *>(v->key);
3154    const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3155    const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3156
3157    if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3158       return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
3159
3160    else if (l == FRAG_RESULT_COLOR)
3161       return alloc_temporary(v->bld, 4, v->outputs,
3162                              MAX2(key->nr_color_regions, 1));
3163
3164    else if (l == FRAG_RESULT_DEPTH)
3165       return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
3166
3167    else if (l == FRAG_RESULT_STENCIL)
3168       return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
3169
3170    else if (l == FRAG_RESULT_SAMPLE_MASK)
3171       return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
3172
3173    else if (l >= FRAG_RESULT_DATA0 &&
3174             l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3175       return alloc_temporary(v->bld, 4,
3176                              &v->outputs[l - FRAG_RESULT_DATA0], 1);
3177
3178    else
3179       unreachable("Invalid location");
3180 }
3181
3182 void
3183 fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
3184                                   nir_intrinsic_instr *instr)
3185 {
3186    assert(stage == MESA_SHADER_FRAGMENT);
3187
3188    fs_reg dest;
3189    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3190       dest = get_nir_dest(instr->dest);
3191
3192    switch (instr->intrinsic) {
3193    case nir_intrinsic_load_front_face:
3194       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
3195               *emit_frontfacing_interpolation());
3196       break;
3197
3198    case nir_intrinsic_load_sample_pos: {
3199       fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
3200       assert(sample_pos.file != BAD_FILE);
3201       dest.type = sample_pos.type;
3202       bld.MOV(dest, sample_pos);
3203       bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3204       break;
3205    }
3206
3207    case nir_intrinsic_load_layer_id:
3208       dest.type = BRW_REGISTER_TYPE_UD;
3209       bld.MOV(dest, fetch_render_target_array_index(bld));
3210       break;
3211
3212    case nir_intrinsic_load_helper_invocation:
3213    case nir_intrinsic_load_sample_mask_in:
3214    case nir_intrinsic_load_sample_id: {
3215       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3216       fs_reg val = nir_system_values[sv];
3217       assert(val.file != BAD_FILE);
3218       dest.type = val.type;
3219       bld.MOV(dest, val);
3220       break;
3221    }
3222
3223    case nir_intrinsic_store_output: {
3224       const fs_reg src = get_nir_src(instr->src[0]);
3225       const unsigned store_offset = nir_src_as_uint(instr->src[1]);
3226       const unsigned location = nir_intrinsic_base(instr) +
3227          SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);
3228       const fs_reg new_dest = retype(alloc_frag_output(this, location),
3229                                      src.type);
3230
3231       for (unsigned j = 0; j < instr->num_components; j++)
3232          bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3233                  offset(src, bld, j));
3234
3235       break;
3236    }
3237
3238    case nir_intrinsic_load_output: {
3239       const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3240                                    BRW_NIR_FRAG_OUTPUT_LOCATION);
3241       assert(l >= FRAG_RESULT_DATA0);
3242       const unsigned load_offset = nir_src_as_uint(instr->src[0]);
3243       const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
3244       const fs_reg tmp = bld.vgrf(dest.type, 4);
3245
3246       if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
3247          emit_coherent_fb_read(bld, tmp, target);
3248       else
3249          emit_non_coherent_fb_read(bld, tmp, target);
3250
3251       for (unsigned j = 0; j < instr->num_components; j++) {
3252          bld.MOV(offset(dest, bld, j),
3253                  offset(tmp, bld, nir_intrinsic_component(instr) + j));
3254       }
3255
3256       break;
3257    }
3258
3259    case nir_intrinsic_discard:
3260    case nir_intrinsic_discard_if: {
3261       /* We track our discarded pixels in f0.1.  By predicating on it, we can
3262        * update just the flag bits that aren't yet discarded.  If there's no
3263        * condition, we emit a CMP of g0 != g0, so all currently executing
3264        * channels will get turned off.
3265        */
3266       fs_inst *cmp;
3267       if (instr->intrinsic == nir_intrinsic_discard_if) {
3268          cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
3269                        brw_imm_d(0), BRW_CONDITIONAL_Z);
3270       } else {
3271          fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3272                                        BRW_REGISTER_TYPE_UW));
3273          cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
3274       }
3275       cmp->predicate = BRW_PREDICATE_NORMAL;
3276       cmp->flag_subreg = 1;
3277
3278       if (devinfo->gen >= 6) {
3279          emit_discard_jump();
3280       }
3281
3282       limit_dispatch_width(16, "Fragment discard not implemented in SIMD32 mode.");
3283       break;
3284    }
3285
3286    case nir_intrinsic_load_input: {
3287       /* load_input is only used for flat inputs */
3288       unsigned base = nir_intrinsic_base(instr);
3289       unsigned comp = nir_intrinsic_component(instr);
3290       unsigned num_components = instr->num_components;
3291       fs_reg orig_dest = dest;
3292       enum brw_reg_type type = dest.type;
3293
3294       /* Special case fields in the VUE header */
3295       if (base == VARYING_SLOT_LAYER)
3296          comp = 1;
3297       else if (base == VARYING_SLOT_VIEWPORT)
3298          comp = 2;
3299
3300       if (nir_dest_bit_size(instr->dest) == 64) {
3301          /* const_index is in 32-bit type size units that could not be aligned
3302           * with DF. We need to read the double vector as if it was a float
3303           * vector of twice the number of components to fetch the right data.
3304           */
3305          type = BRW_REGISTER_TYPE_F;
3306          num_components *= 2;
3307          dest = bld.vgrf(type, num_components);
3308       }
3309
3310       for (unsigned int i = 0; i < num_components; i++) {
3311          bld.MOV(offset(retype(dest, type), bld, i),
3312                  retype(component(interp_reg(base, comp + i), 3), type));
3313       }
3314
3315       if (nir_dest_bit_size(instr->dest) == 64) {
3316          shuffle_from_32bit_read(bld, orig_dest, dest, 0,
3317                                  instr->num_components);
3318       }
3319       break;
3320    }
3321
3322    case nir_intrinsic_load_barycentric_pixel:
3323    case nir_intrinsic_load_barycentric_centroid:
3324    case nir_intrinsic_load_barycentric_sample:
3325       /* Do nothing - load_interpolated_input handling will handle it later. */
3326       break;
3327
3328    case nir_intrinsic_load_barycentric_at_sample: {
3329       const glsl_interp_mode interpolation =
3330          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3331
3332       if (nir_src_is_const(instr->src[0])) {
3333          unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4;
3334
3335          emit_pixel_interpolater_send(bld,
3336                                       FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3337                                       dest,
3338                                       fs_reg(), /* src */
3339                                       brw_imm_ud(msg_data),
3340                                       interpolation);
3341       } else {
3342          const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
3343                                           BRW_REGISTER_TYPE_UD);
3344
3345          if (nir_src_is_dynamically_uniform(instr->src[0])) {
3346             const fs_reg sample_id = bld.emit_uniformize(sample_src);
3347             const fs_reg msg_data = vgrf(glsl_type::uint_type);
3348             bld.exec_all().group(1, 0)
3349                .SHL(msg_data, sample_id, brw_imm_ud(4u));
3350             emit_pixel_interpolater_send(bld,
3351                                          FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3352                                          dest,
3353                                          fs_reg(), /* src */
3354                                          msg_data,
3355                                          interpolation);
3356          } else {
3357             /* Make a loop that sends a message to the pixel interpolater
3358              * for the sample number in each live channel. If there are
3359              * multiple channels with the same sample number then these
3360              * will be handled simultaneously with a single interation of
3361              * the loop.
3362              */
3363             bld.emit(BRW_OPCODE_DO);
3364
3365             /* Get the next live sample number into sample_id_reg */
3366             const fs_reg sample_id = bld.emit_uniformize(sample_src);
3367
3368             /* Set the flag register so that we can perform the send
3369              * message on all channels that have the same sample number
3370              */
3371             bld.CMP(bld.null_reg_ud(),
3372                     sample_src, sample_id,
3373                     BRW_CONDITIONAL_EQ);
3374             const fs_reg msg_data = vgrf(glsl_type::uint_type);
3375             bld.exec_all().group(1, 0)
3376                .SHL(msg_data, sample_id, brw_imm_ud(4u));
3377             fs_inst *inst =
3378                emit_pixel_interpolater_send(bld,
3379                                             FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3380                                             dest,
3381                                             fs_reg(), /* src */
3382                                             component(msg_data, 0),
3383                                             interpolation);
3384             set_predicate(BRW_PREDICATE_NORMAL, inst);
3385
3386             /* Continue the loop if there are any live channels left */
3387             set_predicate_inv(BRW_PREDICATE_NORMAL,
3388                               true, /* inverse */
3389                               bld.emit(BRW_OPCODE_WHILE));
3390          }
3391       }
3392       break;
3393    }
3394
3395    case nir_intrinsic_load_barycentric_at_offset: {
3396       const glsl_interp_mode interpolation =
3397          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3398
3399       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3400
3401       if (const_offset) {
3402          assert(nir_src_bit_size(instr->src[0]) == 32);
3403          unsigned off_x = MIN2((int)(const_offset[0].f32 * 16), 7) & 0xf;
3404          unsigned off_y = MIN2((int)(const_offset[1].f32 * 16), 7) & 0xf;
3405
3406          emit_pixel_interpolater_send(bld,
3407                                       FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3408                                       dest,
3409                                       fs_reg(), /* src */
3410                                       brw_imm_ud(off_x | (off_y << 4)),
3411                                       interpolation);
3412       } else {
3413          fs_reg src = vgrf(glsl_type::ivec2_type);
3414          fs_reg offset_src = retype(get_nir_src(instr->src[0]),
3415                                     BRW_REGISTER_TYPE_F);
3416          for (int i = 0; i < 2; i++) {
3417             fs_reg temp = vgrf(glsl_type::float_type);
3418             bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
3419             fs_reg itemp = vgrf(glsl_type::int_type);
3420             /* float to int */
3421             bld.MOV(itemp, temp);
3422
3423             /* Clamp the upper end of the range to +7/16.
3424              * ARB_gpu_shader5 requires that we support a maximum offset
3425              * of +0.5, which isn't representable in a S0.4 value -- if
3426              * we didn't clamp it, we'd end up with -8/16, which is the
3427              * opposite of what the shader author wanted.
3428              *
3429              * This is legal due to ARB_gpu_shader5's quantization
3430              * rules:
3431              *
3432              * "Not all values of <offset> may be supported; x and y
3433              * offsets may be rounded to fixed-point values with the
3434              * number of fraction bits given by the
3435              * implementation-dependent constant
3436              * FRAGMENT_INTERPOLATION_OFFSET_BITS"
3437              */
3438             set_condmod(BRW_CONDITIONAL_L,
3439                         bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
3440          }
3441
3442          const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3443          emit_pixel_interpolater_send(bld,
3444                                       opcode,
3445                                       dest,
3446                                       src,
3447                                       brw_imm_ud(0u),
3448                                       interpolation);
3449       }
3450       break;
3451    }
3452
3453    case nir_intrinsic_load_interpolated_input: {
3454       if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
3455          emit_fragcoord_interpolation(dest);
3456          break;
3457       }
3458
3459       assert(instr->src[0].ssa &&
3460              instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3461       nir_intrinsic_instr *bary_intrinsic =
3462          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3463       nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3464       enum glsl_interp_mode interp_mode =
3465          (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3466       fs_reg dst_xy;
3467
3468       if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3469           bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3470          /* Use the result of the PI message */
3471          dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
3472       } else {
3473          /* Use the delta_xy values computed from the payload */
3474          enum brw_barycentric_mode bary =
3475             brw_barycentric_mode(interp_mode, bary_intrin);
3476
3477          dst_xy = this->delta_xy[bary];
3478       }
3479
3480       for (unsigned int i = 0; i < instr->num_components; i++) {
3481          fs_reg interp =
3482             component(interp_reg(nir_intrinsic_base(instr),
3483                                  nir_intrinsic_component(instr) + i), 0);
3484          interp.type = BRW_REGISTER_TYPE_F;
3485          dest.type = BRW_REGISTER_TYPE_F;
3486
3487          if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3488             fs_reg tmp = vgrf(glsl_type::float_type);
3489             bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
3490             bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
3491          } else {
3492             bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
3493          }
3494       }
3495       break;
3496    }
3497
3498    default:
3499       nir_emit_intrinsic(bld, instr);
3500       break;
3501    }
3502 }
3503
3504 static int
3505 get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
3506 {
3507    if (nir_src_is_const(instr->src[src])) {
3508       int64_t add_val = nir_src_as_int(instr->src[src]);
3509       if (add_val == 1)
3510          return BRW_AOP_INC;
3511       else if (add_val == -1)
3512          return BRW_AOP_DEC;
3513    }
3514
3515    return BRW_AOP_ADD;
3516 }
3517
3518 void
3519 fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
3520                                   nir_intrinsic_instr *instr)
3521 {
3522    assert(stage == MESA_SHADER_COMPUTE);
3523    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
3524
3525    fs_reg dest;
3526    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3527       dest = get_nir_dest(instr->dest);
3528
3529    switch (instr->intrinsic) {
3530    case nir_intrinsic_barrier:
3531       emit_barrier();
3532       cs_prog_data->uses_barrier = true;
3533       break;
3534
3535    case nir_intrinsic_load_subgroup_id:
3536       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id);
3537       break;
3538
3539    case nir_intrinsic_load_local_invocation_id:
3540    case nir_intrinsic_load_work_group_id: {
3541       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3542       fs_reg val = nir_system_values[sv];
3543       assert(val.file != BAD_FILE);
3544       dest.type = val.type;
3545       for (unsigned i = 0; i < 3; i++)
3546          bld.MOV(offset(dest, bld, i), offset(val, bld, i));
3547       break;
3548    }
3549
3550    case nir_intrinsic_load_num_work_groups: {
3551       const unsigned surface =
3552          cs_prog_data->binding_table.work_groups_start;
3553
3554       cs_prog_data->uses_num_work_groups = true;
3555
3556       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3557       srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface);
3558       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3559       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); /* num components */
3560
3561       /* Read the 3 GLuint components of gl_NumWorkGroups */
3562       for (unsigned i = 0; i < 3; i++) {
3563          srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(i << 2);
3564          bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
3565                   offset(dest, bld, i), srcs, SURFACE_LOGICAL_NUM_SRCS);
3566       }
3567       break;
3568    }
3569
3570    case nir_intrinsic_shared_atomic_add:
3571       nir_emit_shared_atomic(bld, get_op_for_atomic_add(instr, 1), instr);
3572       break;
3573    case nir_intrinsic_shared_atomic_imin:
3574       nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr);
3575       break;
3576    case nir_intrinsic_shared_atomic_umin:
3577       nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr);
3578       break;
3579    case nir_intrinsic_shared_atomic_imax:
3580       nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr);
3581       break;
3582    case nir_intrinsic_shared_atomic_umax:
3583       nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr);
3584       break;
3585    case nir_intrinsic_shared_atomic_and:
3586       nir_emit_shared_atomic(bld, BRW_AOP_AND, instr);
3587       break;
3588    case nir_intrinsic_shared_atomic_or:
3589       nir_emit_shared_atomic(bld, BRW_AOP_OR, instr);
3590       break;
3591    case nir_intrinsic_shared_atomic_xor:
3592       nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr);
3593       break;
3594    case nir_intrinsic_shared_atomic_exchange:
3595       nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr);
3596       break;
3597    case nir_intrinsic_shared_atomic_comp_swap:
3598       nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
3599       break;
3600    case nir_intrinsic_shared_atomic_fmin:
3601       nir_emit_shared_atomic_float(bld, BRW_AOP_FMIN, instr);
3602       break;
3603    case nir_intrinsic_shared_atomic_fmax:
3604       nir_emit_shared_atomic_float(bld, BRW_AOP_FMAX, instr);
3605       break;
3606    case nir_intrinsic_shared_atomic_fcomp_swap:
3607       nir_emit_shared_atomic_float(bld, BRW_AOP_FCMPWR, instr);
3608       break;
3609
3610    case nir_intrinsic_load_shared: {
3611       assert(devinfo->gen >= 7);
3612       assert(stage == MESA_SHADER_COMPUTE);
3613
3614       const unsigned bit_size = nir_dest_bit_size(instr->dest);
3615       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3616       srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
3617       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]);
3618       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3619
3620       /* Make dest unsigned because that's what the temporary will be */
3621       dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
3622
3623       /* Read the vector */
3624       if (nir_intrinsic_align(instr) >= 4) {
3625          assert(nir_dest_bit_size(instr->dest) == 32);
3626          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3627          fs_inst *inst =
3628             bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
3629                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
3630          inst->size_written = instr->num_components * dispatch_width * 4;
3631       } else {
3632          assert(nir_dest_bit_size(instr->dest) <= 32);
3633          assert(nir_dest_num_components(instr->dest) == 1);
3634          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
3635
3636          fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
3637          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
3638                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
3639          bld.MOV(dest, read_result);
3640       }
3641       break;
3642    }
3643
3644    case nir_intrinsic_store_shared: {
3645       assert(devinfo->gen >= 7);
3646       assert(stage == MESA_SHADER_COMPUTE);
3647
3648       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
3649       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3650       srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
3651       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
3652       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3653
3654       fs_reg data = get_nir_src(instr->src[0]);
3655       data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
3656
3657       assert(nir_intrinsic_write_mask(instr) ==
3658              (1u << instr->num_components) - 1);
3659       if (nir_intrinsic_align(instr) >= 4) {
3660          assert(nir_src_bit_size(instr->src[0]) == 32);
3661          assert(nir_src_num_components(instr->src[0]) <= 4);
3662          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
3663          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3664          bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
3665                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3666       } else {
3667          assert(nir_src_bit_size(instr->src[0]) <= 32);
3668          assert(nir_src_num_components(instr->src[0]) == 1);
3669          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
3670
3671          srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
3672          bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
3673
3674          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
3675                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3676       }
3677       break;
3678    }
3679
3680    default:
3681       nir_emit_intrinsic(bld, instr);
3682       break;
3683    }
3684 }
3685
3686 static fs_reg
3687 brw_nir_reduction_op_identity(const fs_builder &bld,
3688                               nir_op op, brw_reg_type type)
3689 {
3690    nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
3691    switch (type_sz(type)) {
3692    case 2:
3693       assert(type != BRW_REGISTER_TYPE_HF);
3694       return retype(brw_imm_uw(value.u16), type);
3695    case 4:
3696       return retype(brw_imm_ud(value.u32), type);
3697    case 8:
3698       if (type == BRW_REGISTER_TYPE_DF)
3699          return setup_imm_df(bld, value.f64);
3700       else
3701          return retype(brw_imm_u64(value.u64), type);
3702    default:
3703       unreachable("Invalid type size");
3704    }
3705 }
3706
3707 static opcode
3708 brw_op_for_nir_reduction_op(nir_op op)
3709 {
3710    switch (op) {
3711    case nir_op_iadd: return BRW_OPCODE_ADD;
3712    case nir_op_fadd: return BRW_OPCODE_ADD;
3713    case nir_op_imul: return BRW_OPCODE_MUL;
3714    case nir_op_fmul: return BRW_OPCODE_MUL;
3715    case nir_op_imin: return BRW_OPCODE_SEL;
3716    case nir_op_umin: return BRW_OPCODE_SEL;
3717    case nir_op_fmin: return BRW_OPCODE_SEL;
3718    case nir_op_imax: return BRW_OPCODE_SEL;
3719    case nir_op_umax: return BRW_OPCODE_SEL;
3720    case nir_op_fmax: return BRW_OPCODE_SEL;
3721    case nir_op_iand: return BRW_OPCODE_AND;
3722    case nir_op_ior:  return BRW_OPCODE_OR;
3723    case nir_op_ixor: return BRW_OPCODE_XOR;
3724    default:
3725       unreachable("Invalid reduction operation");
3726    }
3727 }
3728
3729 static brw_conditional_mod
3730 brw_cond_mod_for_nir_reduction_op(nir_op op)
3731 {
3732    switch (op) {
3733    case nir_op_iadd: return BRW_CONDITIONAL_NONE;
3734    case nir_op_fadd: return BRW_CONDITIONAL_NONE;
3735    case nir_op_imul: return BRW_CONDITIONAL_NONE;
3736    case nir_op_fmul: return BRW_CONDITIONAL_NONE;
3737    case nir_op_imin: return BRW_CONDITIONAL_L;
3738    case nir_op_umin: return BRW_CONDITIONAL_L;
3739    case nir_op_fmin: return BRW_CONDITIONAL_L;
3740    case nir_op_imax: return BRW_CONDITIONAL_GE;
3741    case nir_op_umax: return BRW_CONDITIONAL_GE;
3742    case nir_op_fmax: return BRW_CONDITIONAL_GE;
3743    case nir_op_iand: return BRW_CONDITIONAL_NONE;
3744    case nir_op_ior:  return BRW_CONDITIONAL_NONE;
3745    case nir_op_ixor: return BRW_CONDITIONAL_NONE;
3746    default:
3747       unreachable("Invalid reduction operation");
3748    }
3749 }
3750
3751 fs_reg
3752 fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld,
3753                                           nir_intrinsic_instr *instr)
3754 {
3755    fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD);
3756
3757    if (stage_prog_data->binding_table.image_start > 0) {
3758       if (image.file == BRW_IMMEDIATE_VALUE) {
3759          image.d += stage_prog_data->binding_table.image_start;
3760       } else {
3761          bld.ADD(image, image,
3762                  brw_imm_d(stage_prog_data->binding_table.image_start));
3763       }
3764    }
3765
3766    return bld.emit_uniformize(image);
3767 }
3768
3769 fs_reg
3770 fs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld,
3771                                          nir_intrinsic_instr *instr)
3772 {
3773    /* SSBO stores are weird in that their index is in src[1] */
3774    const unsigned src = instr->intrinsic == nir_intrinsic_store_ssbo ? 1 : 0;
3775
3776    fs_reg surf_index;
3777    if (nir_src_is_const(instr->src[src])) {
3778       unsigned index = stage_prog_data->binding_table.ssbo_start +
3779                        nir_src_as_uint(instr->src[src]);
3780       surf_index = brw_imm_ud(index);
3781    } else {
3782       surf_index = vgrf(glsl_type::uint_type);
3783       bld.ADD(surf_index, get_nir_src(instr->src[src]),
3784               brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
3785    }
3786
3787    return bld.emit_uniformize(surf_index);
3788 }
3789
3790 static unsigned
3791 image_intrinsic_coord_components(nir_intrinsic_instr *instr)
3792 {
3793    switch (nir_intrinsic_image_dim(instr)) {
3794    case GLSL_SAMPLER_DIM_1D:
3795       return 1 + nir_intrinsic_image_array(instr);
3796    case GLSL_SAMPLER_DIM_2D:
3797    case GLSL_SAMPLER_DIM_RECT:
3798       return 2 + nir_intrinsic_image_array(instr);
3799    case GLSL_SAMPLER_DIM_3D:
3800    case GLSL_SAMPLER_DIM_CUBE:
3801       return 3;
3802    case GLSL_SAMPLER_DIM_BUF:
3803       return 1;
3804    case GLSL_SAMPLER_DIM_MS:
3805       return 2 + nir_intrinsic_image_array(instr);
3806    default:
3807       unreachable("Invalid image dimension");
3808    }
3809 }
3810
3811 void
3812 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
3813 {
3814    fs_reg dest;
3815    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3816       dest = get_nir_dest(instr->dest);
3817
3818    switch (instr->intrinsic) {
3819    case nir_intrinsic_image_load:
3820    case nir_intrinsic_image_store:
3821    case nir_intrinsic_image_atomic_add:
3822    case nir_intrinsic_image_atomic_min:
3823    case nir_intrinsic_image_atomic_max:
3824    case nir_intrinsic_image_atomic_and:
3825    case nir_intrinsic_image_atomic_or:
3826    case nir_intrinsic_image_atomic_xor:
3827    case nir_intrinsic_image_atomic_exchange:
3828    case nir_intrinsic_image_atomic_comp_swap: {
3829       if (stage == MESA_SHADER_FRAGMENT &&
3830           instr->intrinsic != nir_intrinsic_image_load)
3831          brw_wm_prog_data(prog_data)->has_side_effects = true;
3832
3833       /* Get some metadata from the image intrinsic. */
3834       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3835       const GLenum format = nir_intrinsic_format(instr);
3836
3837       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3838       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
3839          get_nir_image_intrinsic_image(bld, instr);
3840       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
3841       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
3842          brw_imm_ud(image_intrinsic_coord_components(instr));
3843
3844       /* Emit an image load, store or atomic op. */
3845       if (instr->intrinsic == nir_intrinsic_image_load) {
3846          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3847          fs_inst *inst =
3848             bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
3849                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
3850          inst->size_written = instr->num_components * dispatch_width * 4;
3851       } else if (instr->intrinsic == nir_intrinsic_image_store) {
3852          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3853          srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]);
3854          bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
3855                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3856       } else {
3857          int op;
3858          unsigned num_srcs = info->num_srcs;
3859
3860          switch (instr->intrinsic) {
3861          case nir_intrinsic_image_atomic_add:
3862             assert(num_srcs == 4);
3863
3864             op = get_op_for_atomic_add(instr, 3);
3865
3866             if (op != BRW_AOP_ADD)
3867                num_srcs = 3;
3868             break;
3869          case nir_intrinsic_image_atomic_min:
3870             assert(format == GL_R32UI || format == GL_R32I);
3871             op = (format == GL_R32I) ? BRW_AOP_IMIN : BRW_AOP_UMIN;
3872             break;
3873          case nir_intrinsic_image_atomic_max:
3874             assert(format == GL_R32UI || format == GL_R32I);
3875             op = (format == GL_R32I) ? BRW_AOP_IMAX : BRW_AOP_UMAX;
3876             break;
3877          case nir_intrinsic_image_atomic_and:
3878             op = BRW_AOP_AND;
3879             break;
3880          case nir_intrinsic_image_atomic_or:
3881             op = BRW_AOP_OR;
3882             break;
3883          case nir_intrinsic_image_atomic_xor:
3884             op = BRW_AOP_XOR;
3885             break;
3886          case nir_intrinsic_image_atomic_exchange:
3887             op = BRW_AOP_MOV;
3888             break;
3889          case nir_intrinsic_image_atomic_comp_swap:
3890             op = BRW_AOP_CMPWR;
3891             break;
3892          default:
3893             unreachable("Not reachable.");
3894          }
3895
3896          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
3897
3898          fs_reg data;
3899          if (num_srcs >= 4)
3900             data = get_nir_src(instr->src[3]);
3901          if (num_srcs >= 5) {
3902             fs_reg tmp = bld.vgrf(data.type, 2);
3903             fs_reg sources[2] = { data, get_nir_src(instr->src[4]) };
3904             bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
3905             data = tmp;
3906          }
3907          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
3908
3909          bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
3910                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
3911       }
3912       break;
3913    }
3914
3915    case nir_intrinsic_image_size: {
3916       /* Unlike the [un]typed load and store opcodes, the TXS that this turns
3917        * into will handle the binding table index for us in the geneerator.
3918        */
3919       fs_reg image = retype(get_nir_src_imm(instr->src[0]),
3920                             BRW_REGISTER_TYPE_UD);
3921       image = bld.emit_uniformize(image);
3922
3923       fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3924       srcs[TEX_LOGICAL_SRC_SURFACE] = image;
3925       srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
3926       srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);
3927       srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
3928
3929       /* Since the image size is always uniform, we can just emit a SIMD8
3930        * query instruction and splat the result out.
3931        */
3932       const fs_builder ubld = bld.exec_all().group(8, 0);
3933
3934       fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
3935       fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
3936                                 tmp, srcs, ARRAY_SIZE(srcs));
3937       inst->size_written = 4 * REG_SIZE;
3938
3939       for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
3940          if (c == 2 && nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_CUBE) {
3941             bld.emit(SHADER_OPCODE_INT_QUOTIENT,
3942                      offset(retype(dest, tmp.type), bld, c),
3943                      component(offset(tmp, ubld, c), 0), brw_imm_ud(6));
3944          } else {
3945             bld.MOV(offset(retype(dest, tmp.type), bld, c),
3946                     component(offset(tmp, ubld, c), 0));
3947          }
3948       }
3949       break;
3950    }
3951
3952    case nir_intrinsic_image_load_raw_intel: {
3953       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3954       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
3955          get_nir_image_intrinsic_image(bld, instr);
3956       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
3957       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3958       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3959
3960       fs_inst *inst =
3961          bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
3962                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
3963       inst->size_written = instr->num_components * dispatch_width * 4;
3964       break;
3965    }
3966
3967    case nir_intrinsic_image_store_raw_intel: {
3968       if (stage == MESA_SHADER_FRAGMENT)
3969          brw_wm_prog_data(prog_data)->has_side_effects = true;
3970
3971       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3972       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
3973          get_nir_image_intrinsic_image(bld, instr);
3974       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
3975       srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]);
3976       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3977       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3978
3979       bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
3980                fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3981       break;
3982    }
3983
3984    case nir_intrinsic_group_memory_barrier:
3985    case nir_intrinsic_memory_barrier_shared:
3986    case nir_intrinsic_memory_barrier_atomic_counter:
3987    case nir_intrinsic_memory_barrier_buffer:
3988    case nir_intrinsic_memory_barrier_image:
3989    case nir_intrinsic_memory_barrier: {
3990       const fs_builder ubld = bld.group(8, 0);
3991       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
3992       ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
3993          ->size_written = 2 * REG_SIZE;
3994       break;
3995    }
3996
3997    case nir_intrinsic_shader_clock: {
3998       /* We cannot do anything if there is an event, so ignore it for now */
3999       const fs_reg shader_clock = get_timestamp(bld);
4000       const fs_reg srcs[] = { component(shader_clock, 0),
4001                               component(shader_clock, 1) };
4002       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
4003       break;
4004    }
4005
4006    case nir_intrinsic_image_samples:
4007       /* The driver does not support multi-sampled images. */
4008       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
4009       break;
4010
4011    case nir_intrinsic_load_uniform: {
4012       /* Offsets are in bytes but they should always aligned to
4013        * the type size
4014        */
4015       assert(instr->const_index[0] % 4 == 0 ||
4016              instr->const_index[0] % type_sz(dest.type) == 0);
4017
4018       fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
4019
4020       if (nir_src_is_const(instr->src[0])) {
4021          unsigned load_offset = nir_src_as_uint(instr->src[0]);
4022          assert(load_offset % type_sz(dest.type) == 0);
4023          /* For 16-bit types we add the module of the const_index[0]
4024           * offset to access to not 32-bit aligned element
4025           */
4026          src.offset = load_offset + instr->const_index[0] % 4;
4027
4028          for (unsigned j = 0; j < instr->num_components; j++) {
4029             bld.MOV(offset(dest, bld, j), offset(src, bld, j));
4030          }
4031       } else {
4032          fs_reg indirect = retype(get_nir_src(instr->src[0]),
4033                                   BRW_REGISTER_TYPE_UD);
4034
4035          /* We need to pass a size to the MOV_INDIRECT but we don't want it to
4036           * go past the end of the uniform.  In order to keep the n'th
4037           * component from running past, we subtract off the size of all but
4038           * one component of the vector.
4039           */
4040          assert(instr->const_index[1] >=
4041                 instr->num_components * (int) type_sz(dest.type));
4042          unsigned read_size = instr->const_index[1] -
4043             (instr->num_components - 1) * type_sz(dest.type);
4044
4045          bool supports_64bit_indirects =
4046             !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo);
4047
4048          if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
4049             for (unsigned j = 0; j < instr->num_components; j++) {
4050                bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4051                         offset(dest, bld, j), offset(src, bld, j),
4052                         indirect, brw_imm_ud(read_size));
4053             }
4054          } else {
4055             const unsigned num_mov_indirects =
4056                type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
4057             /* We read a little bit less per MOV INDIRECT, as they are now
4058              * 32-bits ones instead of 64-bit. Fix read_size then.
4059              */
4060             const unsigned read_size_32bit = read_size -
4061                 (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
4062             for (unsigned j = 0; j < instr->num_components; j++) {
4063                for (unsigned i = 0; i < num_mov_indirects; i++) {
4064                   bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4065                            subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
4066                            subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
4067                            indirect, brw_imm_ud(read_size_32bit));
4068                }
4069             }
4070          }
4071       }
4072       break;
4073    }
4074
4075    case nir_intrinsic_load_ubo: {
4076       fs_reg surf_index;
4077       if (nir_src_is_const(instr->src[0])) {
4078          const unsigned index = stage_prog_data->binding_table.ubo_start +
4079                                 nir_src_as_uint(instr->src[0]);
4080          surf_index = brw_imm_ud(index);
4081       } else {
4082          /* The block index is not a constant. Evaluate the index expression
4083           * per-channel and add the base UBO index; we have to select a value
4084           * from any live channel.
4085           */
4086          surf_index = vgrf(glsl_type::uint_type);
4087          bld.ADD(surf_index, get_nir_src(instr->src[0]),
4088                  brw_imm_ud(stage_prog_data->binding_table.ubo_start));
4089          surf_index = bld.emit_uniformize(surf_index);
4090       }
4091
4092       if (!nir_src_is_const(instr->src[1])) {
4093          fs_reg base_offset = retype(get_nir_src(instr->src[1]),
4094                                      BRW_REGISTER_TYPE_UD);
4095
4096          for (int i = 0; i < instr->num_components; i++)
4097             VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
4098                                        base_offset, i * type_sz(dest.type));
4099       } else {
4100          /* Even if we are loading doubles, a pull constant load will load
4101           * a 32-bit vec4, so should only reserve vgrf space for that. If we
4102           * need to load a full dvec4 we will have to emit 2 loads. This is
4103           * similar to demote_pull_constants(), except that in that case we
4104           * see individual accesses to each component of the vector and then
4105           * we let CSE deal with duplicate loads. Here we see a vector access
4106           * and we have to split it if necessary.
4107           */
4108          const unsigned type_size = type_sz(dest.type);
4109          const unsigned load_offset = nir_src_as_uint(instr->src[1]);
4110
4111          /* See if we've selected this as a push constant candidate */
4112          if (nir_src_is_const(instr->src[0])) {
4113             const unsigned ubo_block = nir_src_as_uint(instr->src[0]);
4114             const unsigned offset_256b = load_offset / 32;
4115
4116             fs_reg push_reg;
4117             for (int i = 0; i < 4; i++) {
4118                const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
4119                if (range->block == ubo_block &&
4120                    offset_256b >= range->start &&
4121                    offset_256b < range->start + range->length) {
4122
4123                   push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
4124                   push_reg.offset = load_offset - 32 * range->start;
4125                   break;
4126                }
4127             }
4128
4129             if (push_reg.file != BAD_FILE) {
4130                for (unsigned i = 0; i < instr->num_components; i++) {
4131                   bld.MOV(offset(dest, bld, i),
4132                           byte_offset(push_reg, i * type_size));
4133                }
4134                break;
4135             }
4136          }
4137
4138          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
4139          const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
4140          const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4141
4142          for (unsigned c = 0; c < instr->num_components;) {
4143             const unsigned base = load_offset + c * type_size;
4144             /* Number of usable components in the next block-aligned load. */
4145             const unsigned count = MIN2(instr->num_components - c,
4146                                         (block_sz - base % block_sz) / type_size);
4147
4148             ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
4149                       packed_consts, surf_index,
4150                       brw_imm_ud(base & ~(block_sz - 1)));
4151
4152             const fs_reg consts =
4153                retype(byte_offset(packed_consts, base & (block_sz - 1)),
4154                       dest.type);
4155
4156             for (unsigned d = 0; d < count; d++)
4157                bld.MOV(offset(dest, bld, c + d), component(consts, d));
4158
4159             c += count;
4160          }
4161       }
4162       break;
4163    }
4164
4165    case nir_intrinsic_load_global: {
4166       assert(devinfo->gen >= 8);
4167
4168       if (nir_intrinsic_align(instr) >= 4) {
4169          assert(nir_dest_bit_size(instr->dest) == 32);
4170          fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL,
4171                                   dest,
4172                                   get_nir_src(instr->src[0]), /* Address */
4173                                   fs_reg(), /* No source data */
4174                                   brw_imm_ud(instr->num_components));
4175          inst->size_written = instr->num_components *
4176                               inst->dst.component_size(inst->exec_size);
4177       } else {
4178          const unsigned bit_size = nir_dest_bit_size(instr->dest);
4179          assert(bit_size <= 32);
4180          assert(nir_dest_num_components(instr->dest) == 1);
4181          brw_reg_type data_type =
4182             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4183          fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4184          bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL,
4185                   tmp,
4186                   get_nir_src(instr->src[0]), /* Address */
4187                   fs_reg(), /* No source data */
4188                   brw_imm_ud(bit_size));
4189          bld.MOV(retype(dest, data_type), tmp);
4190       }
4191       break;
4192    }
4193
4194    case nir_intrinsic_store_global:
4195       assert(devinfo->gen >= 8);
4196
4197       if (stage == MESA_SHADER_FRAGMENT)
4198          brw_wm_prog_data(prog_data)->has_side_effects = true;
4199
4200       if (nir_intrinsic_align(instr) >= 4) {
4201          assert(nir_src_bit_size(instr->src[0]) == 32);
4202          bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL,
4203                   fs_reg(),
4204                   get_nir_src(instr->src[1]), /* Address */
4205                   get_nir_src(instr->src[0]), /* Data */
4206                   brw_imm_ud(instr->num_components));
4207       } else {
4208          const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4209          assert(bit_size <= 32);
4210          assert(nir_src_num_components(instr->src[0]) == 1);
4211          brw_reg_type data_type =
4212             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4213          fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4214          bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type));
4215          bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL,
4216                   fs_reg(),
4217                   get_nir_src(instr->src[1]), /* Address */
4218                   tmp, /* Data */
4219                   brw_imm_ud(nir_src_bit_size(instr->src[0])));
4220       }
4221       break;
4222
4223    case nir_intrinsic_global_atomic_add:
4224       nir_emit_global_atomic(bld, get_op_for_atomic_add(instr, 1), instr);
4225       break;
4226    case nir_intrinsic_global_atomic_imin:
4227       nir_emit_global_atomic(bld, BRW_AOP_IMIN, instr);
4228       break;
4229    case nir_intrinsic_global_atomic_umin:
4230       nir_emit_global_atomic(bld, BRW_AOP_UMIN, instr);
4231       break;
4232    case nir_intrinsic_global_atomic_imax:
4233       nir_emit_global_atomic(bld, BRW_AOP_IMAX, instr);
4234       break;
4235    case nir_intrinsic_global_atomic_umax:
4236       nir_emit_global_atomic(bld, BRW_AOP_UMAX, instr);
4237       break;
4238    case nir_intrinsic_global_atomic_and:
4239       nir_emit_global_atomic(bld, BRW_AOP_AND, instr);
4240       break;
4241    case nir_intrinsic_global_atomic_or:
4242       nir_emit_global_atomic(bld, BRW_AOP_OR, instr);
4243       break;
4244    case nir_intrinsic_global_atomic_xor:
4245       nir_emit_global_atomic(bld, BRW_AOP_XOR, instr);
4246       break;
4247    case nir_intrinsic_global_atomic_exchange:
4248       nir_emit_global_atomic(bld, BRW_AOP_MOV, instr);
4249       break;
4250    case nir_intrinsic_global_atomic_comp_swap:
4251       nir_emit_global_atomic(bld, BRW_AOP_CMPWR, instr);
4252       break;
4253    case nir_intrinsic_global_atomic_fmin:
4254       nir_emit_global_atomic_float(bld, BRW_AOP_FMIN, instr);
4255       break;
4256    case nir_intrinsic_global_atomic_fmax:
4257       nir_emit_global_atomic_float(bld, BRW_AOP_FMAX, instr);
4258       break;
4259    case nir_intrinsic_global_atomic_fcomp_swap:
4260       nir_emit_global_atomic_float(bld, BRW_AOP_FCMPWR, instr);
4261       break;
4262
4263    case nir_intrinsic_load_ssbo: {
4264       assert(devinfo->gen >= 7);
4265
4266       const unsigned bit_size = nir_dest_bit_size(instr->dest);
4267       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4268       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4269          get_nir_ssbo_intrinsic_index(bld, instr);
4270       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4271       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4272
4273       /* Make dest unsigned because that's what the temporary will be */
4274       dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4275
4276       /* Read the vector */
4277       if (nir_intrinsic_align(instr) >= 4) {
4278          assert(nir_dest_bit_size(instr->dest) == 32);
4279          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4280          fs_inst *inst =
4281             bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4282                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4283          inst->size_written = instr->num_components * dispatch_width * 4;
4284       } else {
4285          assert(nir_dest_bit_size(instr->dest) <= 32);
4286          assert(nir_dest_num_components(instr->dest) == 1);
4287          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
4288
4289          fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
4290          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
4291                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
4292          bld.MOV(dest, read_result);
4293       }
4294       break;
4295    }
4296
4297    case nir_intrinsic_store_ssbo: {
4298       assert(devinfo->gen >= 7);
4299
4300       if (stage == MESA_SHADER_FRAGMENT)
4301          brw_wm_prog_data(prog_data)->has_side_effects = true;
4302
4303       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4304       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4305       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4306          get_nir_ssbo_intrinsic_index(bld, instr);
4307       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]);
4308       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4309
4310       fs_reg data = get_nir_src(instr->src[0]);
4311       data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4312
4313       assert(nir_intrinsic_write_mask(instr) ==
4314              (1u << instr->num_components) - 1);
4315       if (nir_intrinsic_align(instr) >= 4) {
4316          assert(nir_src_bit_size(instr->src[0]) == 32);
4317          assert(nir_src_num_components(instr->src[0]) <= 4);
4318          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4319          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4320          bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4321                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4322       } else {
4323          assert(nir_src_bit_size(instr->src[0]) <= 32);
4324          assert(nir_src_num_components(instr->src[0]) == 1);
4325          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
4326
4327          srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
4328          bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
4329
4330          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
4331                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4332       }
4333       break;
4334    }
4335
4336    case nir_intrinsic_store_output: {
4337       fs_reg src = get_nir_src(instr->src[0]);
4338
4339       unsigned store_offset = nir_src_as_uint(instr->src[1]);
4340       unsigned num_components = instr->num_components;
4341       unsigned first_component = nir_intrinsic_component(instr);
4342       if (nir_src_bit_size(instr->src[0]) == 64) {
4343          src = shuffle_for_32bit_write(bld, src, 0, num_components);
4344          num_components *= 2;
4345       }
4346
4347       fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
4348                                       4 * store_offset), src.type);
4349       for (unsigned j = 0; j < num_components; j++) {
4350          bld.MOV(offset(new_dest, bld, j + first_component),
4351                  offset(src, bld, j));
4352       }
4353       break;
4354    }
4355
4356    case nir_intrinsic_ssbo_atomic_add:
4357       nir_emit_ssbo_atomic(bld, get_op_for_atomic_add(instr, 2), instr);
4358       break;
4359    case nir_intrinsic_ssbo_atomic_imin:
4360       nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
4361       break;
4362    case nir_intrinsic_ssbo_atomic_umin:
4363       nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
4364       break;
4365    case nir_intrinsic_ssbo_atomic_imax:
4366       nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
4367       break;
4368    case nir_intrinsic_ssbo_atomic_umax:
4369       nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
4370       break;
4371    case nir_intrinsic_ssbo_atomic_and:
4372       nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
4373       break;
4374    case nir_intrinsic_ssbo_atomic_or:
4375       nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
4376       break;
4377    case nir_intrinsic_ssbo_atomic_xor:
4378       nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
4379       break;
4380    case nir_intrinsic_ssbo_atomic_exchange:
4381       nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
4382       break;
4383    case nir_intrinsic_ssbo_atomic_comp_swap:
4384       nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
4385       break;
4386    case nir_intrinsic_ssbo_atomic_fmin:
4387       nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMIN, instr);
4388       break;
4389    case nir_intrinsic_ssbo_atomic_fmax:
4390       nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMAX, instr);
4391       break;
4392    case nir_intrinsic_ssbo_atomic_fcomp_swap:
4393       nir_emit_ssbo_atomic_float(bld, BRW_AOP_FCMPWR, instr);
4394       break;
4395
4396    case nir_intrinsic_get_buffer_size: {
4397       assert(nir_src_num_components(instr->src[0]) == 1);
4398       unsigned ssbo_index = nir_src_is_const(instr->src[0]) ?
4399                             nir_src_as_uint(instr->src[0]) : 0;
4400
4401       /* A resinfo's sampler message is used to get the buffer size.  The
4402        * SIMD8's writeback message consists of four registers and SIMD16's
4403        * writeback message consists of 8 destination registers (two per each
4404        * component).  Because we are only interested on the first channel of
4405        * the first returned component, where resinfo returns the buffer size
4406        * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
4407        * the dispatch width.
4408        */
4409       const fs_builder ubld = bld.exec_all().group(8, 0);
4410       fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4411       fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4412
4413       /* Set LOD = 0 */
4414       ubld.MOV(src_payload, brw_imm_d(0));
4415
4416       const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
4417       fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
4418                                 src_payload, brw_imm_ud(index));
4419       inst->header_size = 0;
4420       inst->mlen = 1;
4421       inst->size_written = 4 * REG_SIZE;
4422
4423       /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
4424        *
4425        * "Out-of-bounds checking is always performed at a DWord granularity. If
4426        * any part of the DWord is out-of-bounds then the whole DWord is
4427        * considered out-of-bounds."
4428        *
4429        * This implies that types with size smaller than 4-bytes need to be
4430        * padded if they don't complete the last dword of the buffer. But as we
4431        * need to maintain the original size we need to reverse the padding
4432        * calculation to return the correct size to know the number of elements
4433        * of an unsized array. As we stored in the last two bits of the surface
4434        * size the needed padding for the buffer, we calculate here the
4435        * original buffer_size reversing the surface_size calculation:
4436        *
4437        * surface_size = isl_align(buffer_size, 4) +
4438        *                (isl_align(buffer_size) - buffer_size)
4439        *
4440        * buffer_size = surface_size & ~3 - surface_size & 3
4441        */
4442
4443       fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4444       fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4445       fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4446
4447       ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
4448       ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
4449       ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
4450
4451       bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
4452       break;
4453    }
4454
4455    case nir_intrinsic_load_subgroup_invocation:
4456       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
4457               nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
4458       break;
4459
4460    case nir_intrinsic_load_subgroup_eq_mask:
4461    case nir_intrinsic_load_subgroup_ge_mask:
4462    case nir_intrinsic_load_subgroup_gt_mask:
4463    case nir_intrinsic_load_subgroup_le_mask:
4464    case nir_intrinsic_load_subgroup_lt_mask:
4465       unreachable("not reached");
4466
4467    case nir_intrinsic_vote_any: {
4468       const fs_builder ubld = bld.exec_all().group(1, 0);
4469
4470       /* The any/all predicates do not consider channel enables. To prevent
4471        * dead channels from affecting the result, we initialize the flag with
4472        * with the identity value for the logical operation.
4473        */
4474       if (dispatch_width == 32) {
4475          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4476          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4477                          brw_imm_ud(0));
4478       } else {
4479          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
4480       }
4481       bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
4482
4483       /* For some reason, the any/all predicates don't work properly with
4484        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4485        * doesn't read the correct subset of the flag register and you end up
4486        * getting garbage in the second half.  Work around this by using a pair
4487        * of 1-wide MOVs and scattering the result.
4488        */
4489       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4490       ubld.MOV(res1, brw_imm_d(0));
4491       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
4492                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
4493                                            BRW_PREDICATE_ALIGN1_ANY32H,
4494                     ubld.MOV(res1, brw_imm_d(-1)));
4495
4496       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4497       break;
4498    }
4499    case nir_intrinsic_vote_all: {
4500       const fs_builder ubld = bld.exec_all().group(1, 0);
4501
4502       /* The any/all predicates do not consider channel enables. To prevent
4503        * dead channels from affecting the result, we initialize the flag with
4504        * with the identity value for the logical operation.
4505        */
4506       if (dispatch_width == 32) {
4507          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4508          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4509                          brw_imm_ud(0xffffffff));
4510       } else {
4511          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
4512       }
4513       bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
4514
4515       /* For some reason, the any/all predicates don't work properly with
4516        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4517        * doesn't read the correct subset of the flag register and you end up
4518        * getting garbage in the second half.  Work around this by using a pair
4519        * of 1-wide MOVs and scattering the result.
4520        */
4521       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4522       ubld.MOV(res1, brw_imm_d(0));
4523       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
4524                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
4525                                            BRW_PREDICATE_ALIGN1_ALL32H,
4526                     ubld.MOV(res1, brw_imm_d(-1)));
4527
4528       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4529       break;
4530    }
4531    case nir_intrinsic_vote_feq:
4532    case nir_intrinsic_vote_ieq: {
4533       fs_reg value = get_nir_src(instr->src[0]);
4534       if (instr->intrinsic == nir_intrinsic_vote_feq) {
4535          const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4536          value.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
4537       }
4538
4539       fs_reg uniformized = bld.emit_uniformize(value);
4540       const fs_builder ubld = bld.exec_all().group(1, 0);
4541
4542       /* The any/all predicates do not consider channel enables. To prevent
4543        * dead channels from affecting the result, we initialize the flag with
4544        * with the identity value for the logical operation.
4545        */
4546       if (dispatch_width == 32) {
4547          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4548          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4549                          brw_imm_ud(0xffffffff));
4550       } else {
4551          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
4552       }
4553       bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
4554
4555       /* For some reason, the any/all predicates don't work properly with
4556        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4557        * doesn't read the correct subset of the flag register and you end up
4558        * getting garbage in the second half.  Work around this by using a pair
4559        * of 1-wide MOVs and scattering the result.
4560        */
4561       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4562       ubld.MOV(res1, brw_imm_d(0));
4563       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
4564                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
4565                                            BRW_PREDICATE_ALIGN1_ALL32H,
4566                     ubld.MOV(res1, brw_imm_d(-1)));
4567
4568       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4569       break;
4570    }
4571
4572    case nir_intrinsic_ballot: {
4573       const fs_reg value = retype(get_nir_src(instr->src[0]),
4574                                   BRW_REGISTER_TYPE_UD);
4575       struct brw_reg flag = brw_flag_reg(0, 0);
4576       /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
4577        * as f0.0.  This is a problem for fragment programs as we currently use
4578        * f0.1 for discards.  Fortunately, we don't support SIMD32 fragment
4579        * programs yet so this isn't a problem.  When we do, something will
4580        * have to change.
4581        */
4582       if (dispatch_width == 32)
4583          flag.type = BRW_REGISTER_TYPE_UD;
4584
4585       bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u));
4586       bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
4587
4588       if (instr->dest.ssa.bit_size > 32) {
4589          dest.type = BRW_REGISTER_TYPE_UQ;
4590       } else {
4591          dest.type = BRW_REGISTER_TYPE_UD;
4592       }
4593       bld.MOV(dest, flag);
4594       break;
4595    }
4596
4597    case nir_intrinsic_read_invocation: {
4598       const fs_reg value = get_nir_src(instr->src[0]);
4599       const fs_reg invocation = get_nir_src(instr->src[1]);
4600       fs_reg tmp = bld.vgrf(value.type);
4601
4602       bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
4603                           bld.emit_uniformize(invocation));
4604
4605       bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0)));
4606       break;
4607    }
4608
4609    case nir_intrinsic_read_first_invocation: {
4610       const fs_reg value = get_nir_src(instr->src[0]);
4611       bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
4612       break;
4613    }
4614
4615    case nir_intrinsic_shuffle: {
4616       const fs_reg value = get_nir_src(instr->src[0]);
4617       const fs_reg index = get_nir_src(instr->src[1]);
4618
4619       bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
4620       break;
4621    }
4622
4623    case nir_intrinsic_first_invocation: {
4624       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4625       bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
4626       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
4627               fs_reg(component(tmp, 0)));
4628       break;
4629    }
4630
4631    case nir_intrinsic_quad_broadcast: {
4632       const fs_reg value = get_nir_src(instr->src[0]);
4633       const unsigned index = nir_src_as_uint(instr->src[1]);
4634
4635       bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
4636                value, brw_imm_ud(index), brw_imm_ud(4));
4637       break;
4638    }
4639
4640    case nir_intrinsic_quad_swap_horizontal: {
4641       const fs_reg value = get_nir_src(instr->src[0]);
4642       const fs_reg tmp = bld.vgrf(value.type);
4643       const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
4644
4645       const fs_reg src_left = horiz_stride(value, 2);
4646       const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
4647       const fs_reg tmp_left = horiz_stride(tmp, 2);
4648       const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
4649
4650       ubld.MOV(tmp_left, src_right);
4651       ubld.MOV(tmp_right, src_left);
4652
4653       bld.MOV(retype(dest, value.type), tmp);
4654       break;
4655    }
4656
4657    case nir_intrinsic_quad_swap_vertical: {
4658       const fs_reg value = get_nir_src(instr->src[0]);
4659       if (nir_src_bit_size(instr->src[0]) == 32) {
4660          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
4661          const fs_reg tmp = bld.vgrf(value.type);
4662          const fs_builder ubld = bld.exec_all();
4663          ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
4664                    brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
4665          bld.MOV(retype(dest, value.type), tmp);
4666       } else {
4667          /* For larger data types, we have to either emit dispatch_width many
4668           * MOVs or else fall back to doing indirects.
4669           */
4670          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4671          bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4672                       brw_imm_w(0x2));
4673          bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
4674       }
4675       break;
4676    }
4677
4678    case nir_intrinsic_quad_swap_diagonal: {
4679       const fs_reg value = get_nir_src(instr->src[0]);
4680       if (nir_src_bit_size(instr->src[0]) == 32) {
4681          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
4682          const fs_reg tmp = bld.vgrf(value.type);
4683          const fs_builder ubld = bld.exec_all();
4684          ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
4685                    brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
4686          bld.MOV(retype(dest, value.type), tmp);
4687       } else {
4688          /* For larger data types, we have to either emit dispatch_width many
4689           * MOVs or else fall back to doing indirects.
4690           */
4691          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4692          bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4693                       brw_imm_w(0x3));
4694          bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
4695       }
4696       break;
4697    }
4698
4699    case nir_intrinsic_reduce: {
4700       fs_reg src = get_nir_src(instr->src[0]);
4701       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
4702       unsigned cluster_size = nir_intrinsic_cluster_size(instr);
4703       if (cluster_size == 0 || cluster_size > dispatch_width)
4704          cluster_size = dispatch_width;
4705
4706       /* Figure out the source type */
4707       src.type = brw_type_for_nir_type(devinfo,
4708          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
4709                         nir_src_bit_size(instr->src[0])));
4710
4711       fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
4712       opcode brw_op = brw_op_for_nir_reduction_op(redop);
4713       brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
4714
4715       /* Set up a register for all of our scratching around and initialize it
4716        * to reduction operation's identity value.
4717        */
4718       fs_reg scan = bld.vgrf(src.type);
4719       bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
4720
4721       bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
4722
4723       dest.type = src.type;
4724       if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
4725          /* In this case, CLUSTER_BROADCAST instruction isn't needed because
4726           * the distance between clusters is at least 2 GRFs.  In this case,
4727           * we don't need the weird striding of the CLUSTER_BROADCAST
4728           * instruction and can just do regular MOVs.
4729           */
4730          assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
4731          const unsigned groups =
4732             (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
4733          const unsigned group_size = dispatch_width / groups;
4734          for (unsigned i = 0; i < groups; i++) {
4735             const unsigned cluster = (i * group_size) / cluster_size;
4736             const unsigned comp = cluster * cluster_size + (cluster_size - 1);
4737             bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
4738                                          component(scan, comp));
4739          }
4740       } else {
4741          bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
4742                   brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
4743       }
4744       break;
4745    }
4746
4747    case nir_intrinsic_inclusive_scan:
4748    case nir_intrinsic_exclusive_scan: {
4749       fs_reg src = get_nir_src(instr->src[0]);
4750       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
4751
4752       /* Figure out the source type */
4753       src.type = brw_type_for_nir_type(devinfo,
4754          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
4755                         nir_src_bit_size(instr->src[0])));
4756
4757       fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
4758       opcode brw_op = brw_op_for_nir_reduction_op(redop);
4759       brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
4760
4761       /* Set up a register for all of our scratching around and initialize it
4762        * to reduction operation's identity value.
4763        */
4764       fs_reg scan = bld.vgrf(src.type);
4765       const fs_builder allbld = bld.exec_all();
4766       allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
4767
4768       if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
4769          /* Exclusive scan is a bit harder because we have to do an annoying
4770           * shift of the contents before we can begin.  To make things worse,
4771           * we can't do this with a normal stride; we have to use indirects.
4772           */
4773          fs_reg shifted = bld.vgrf(src.type);
4774          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4775          allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4776                          brw_imm_w(-1));
4777          allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
4778          allbld.group(1, 0).MOV(component(shifted, 0), identity);
4779          scan = shifted;
4780       }
4781
4782       bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);
4783
4784       bld.MOV(retype(dest, src.type), scan);
4785       break;
4786    }
4787
4788    case nir_intrinsic_begin_invocation_interlock: {
4789       const fs_builder ubld = bld.group(8, 0);
4790       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4791
4792       ubld.emit(SHADER_OPCODE_INTERLOCK, tmp)->size_written = 2 *
4793          REG_SIZE;
4794
4795       break;
4796    }
4797
4798    case nir_intrinsic_end_invocation_interlock: {
4799       /* We don't need to do anything here */
4800       break;
4801    }
4802
4803    default:
4804       unreachable("unknown intrinsic");
4805    }
4806 }
4807
4808 void
4809 fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
4810                                  int op, nir_intrinsic_instr *instr)
4811 {
4812    if (stage == MESA_SHADER_FRAGMENT)
4813       brw_wm_prog_data(prog_data)->has_side_effects = true;
4814
4815    fs_reg dest;
4816    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4817       dest = get_nir_dest(instr->dest);
4818
4819    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4820    srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);
4821    srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4822    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4823    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
4824
4825    fs_reg data;
4826    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
4827       data = get_nir_src(instr->src[2]);
4828
4829    if (op == BRW_AOP_CMPWR) {
4830       fs_reg tmp = bld.vgrf(data.type, 2);
4831       fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };
4832       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4833       data = tmp;
4834    }
4835    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4836
4837    /* Emit the actual atomic operation */
4838
4839    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
4840             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4841 }
4842
4843 void
4844 fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld,
4845                                        int op, nir_intrinsic_instr *instr)
4846 {
4847    if (stage == MESA_SHADER_FRAGMENT)
4848       brw_wm_prog_data(prog_data)->has_side_effects = true;
4849
4850    fs_reg dest;
4851    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4852       dest = get_nir_dest(instr->dest);
4853
4854    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4855    srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);
4856    srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4857    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4858    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
4859
4860    fs_reg data = get_nir_src(instr->src[2]);
4861    if (op == BRW_AOP_FCMPWR) {
4862       fs_reg tmp = bld.vgrf(data.type, 2);
4863       fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };
4864       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4865       data = tmp;
4866    }
4867    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4868
4869    /* Emit the actual atomic operation */
4870
4871    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
4872             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4873 }
4874
4875 void
4876 fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
4877                                    int op, nir_intrinsic_instr *instr)
4878 {
4879    fs_reg dest;
4880    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4881       dest = get_nir_dest(instr->dest);
4882
4883    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4884    srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
4885    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4886    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
4887
4888    fs_reg data;
4889    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
4890       data = get_nir_src(instr->src[1]);
4891    if (op == BRW_AOP_CMPWR) {
4892       fs_reg tmp = bld.vgrf(data.type, 2);
4893       fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
4894       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4895       data = tmp;
4896    }
4897    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4898
4899    /* Get the offset */
4900    if (nir_src_is_const(instr->src[0])) {
4901       srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
4902          brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
4903    } else {
4904       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
4905       bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
4906               retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
4907               brw_imm_ud(instr->const_index[0]));
4908    }
4909
4910    /* Emit the actual atomic operation operation */
4911
4912    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
4913             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4914 }
4915
4916 void
4917 fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld,
4918                                          int op, nir_intrinsic_instr *instr)
4919 {
4920    fs_reg dest;
4921    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4922       dest = get_nir_dest(instr->dest);
4923
4924    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4925    srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
4926    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4927    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
4928
4929    fs_reg data = get_nir_src(instr->src[1]);
4930    if (op == BRW_AOP_FCMPWR) {
4931       fs_reg tmp = bld.vgrf(data.type, 2);
4932       fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
4933       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4934       data = tmp;
4935    }
4936    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4937
4938    /* Get the offset */
4939    if (nir_src_is_const(instr->src[0])) {
4940       srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
4941          brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
4942    } else {
4943       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
4944       bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
4945               retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
4946               brw_imm_ud(instr->const_index[0]));
4947    }
4948
4949    /* Emit the actual atomic operation operation */
4950
4951    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
4952             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4953 }
4954
4955 void
4956 fs_visitor::nir_emit_global_atomic(const fs_builder &bld,
4957                                    int op, nir_intrinsic_instr *instr)
4958 {
4959    if (stage == MESA_SHADER_FRAGMENT)
4960       brw_wm_prog_data(prog_data)->has_side_effects = true;
4961
4962    fs_reg dest;
4963    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4964       dest = get_nir_dest(instr->dest);
4965
4966    fs_reg addr = get_nir_src(instr->src[0]);
4967
4968    fs_reg data;
4969    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
4970       data = get_nir_src(instr->src[1]);
4971
4972    if (op == BRW_AOP_CMPWR) {
4973       fs_reg tmp = bld.vgrf(data.type, 2);
4974       fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
4975       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4976       data = tmp;
4977    }
4978
4979    bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
4980             dest, addr, data, brw_imm_ud(op));
4981 }
4982
4983 void
4984 fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld,
4985                                          int op, nir_intrinsic_instr *instr)
4986 {
4987    if (stage == MESA_SHADER_FRAGMENT)
4988       brw_wm_prog_data(prog_data)->has_side_effects = true;
4989
4990    assert(nir_intrinsic_infos[instr->intrinsic].has_dest);
4991    fs_reg dest = get_nir_dest(instr->dest);
4992
4993    fs_reg addr = get_nir_src(instr->src[0]);
4994
4995    assert(op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC);
4996    fs_reg data = get_nir_src(instr->src[1]);
4997
4998    if (op == BRW_AOP_FCMPWR) {
4999       fs_reg tmp = bld.vgrf(data.type, 2);
5000       fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
5001       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5002       data = tmp;
5003    }
5004
5005    bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
5006             dest, addr, data, brw_imm_ud(op));
5007 }
5008
5009 void
5010 fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
5011 {
5012    unsigned texture = instr->texture_index;
5013    unsigned sampler = instr->sampler_index;
5014
5015    fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
5016
5017    srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
5018    srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);
5019
5020    int lod_components = 0;
5021
5022    /* The hardware requires a LOD for buffer textures */
5023    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
5024       srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
5025
5026    uint32_t header_bits = 0;
5027    for (unsigned i = 0; i < instr->num_srcs; i++) {
5028       fs_reg src = get_nir_src(instr->src[i].src);
5029       switch (instr->src[i].src_type) {
5030       case nir_tex_src_bias:
5031          srcs[TEX_LOGICAL_SRC_LOD] =
5032             retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
5033          break;
5034       case nir_tex_src_comparator:
5035          srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
5036          break;
5037       case nir_tex_src_coord:
5038          switch (instr->op) {
5039          case nir_texop_txf:
5040          case nir_texop_txf_ms:
5041          case nir_texop_txf_ms_mcs:
5042          case nir_texop_samples_identical:
5043             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
5044             break;
5045          default:
5046             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
5047             break;
5048          }
5049          break;
5050       case nir_tex_src_ddx:
5051          srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
5052          lod_components = nir_tex_instr_src_size(instr, i);
5053          break;
5054       case nir_tex_src_ddy:
5055          srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
5056          break;
5057       case nir_tex_src_lod:
5058          switch (instr->op) {
5059          case nir_texop_txs:
5060             srcs[TEX_LOGICAL_SRC_LOD] =
5061                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
5062             break;
5063          case nir_texop_txf:
5064             srcs[TEX_LOGICAL_SRC_LOD] =
5065                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
5066             break;
5067          default:
5068             srcs[TEX_LOGICAL_SRC_LOD] =
5069                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
5070             break;
5071          }
5072          break;
5073       case nir_tex_src_min_lod:
5074          srcs[TEX_LOGICAL_SRC_MIN_LOD] =
5075             retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
5076          break;
5077       case nir_tex_src_ms_index:
5078          srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
5079          break;
5080
5081       case nir_tex_src_offset: {
5082          uint32_t offset_bits = 0;
5083          if (brw_texture_offset(instr, i, &offset_bits)) {
5084             header_bits |= offset_bits;
5085          } else {
5086             srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
5087                retype(src, BRW_REGISTER_TYPE_D);
5088          }
5089          break;
5090       }
5091
5092       case nir_tex_src_projector:
5093          unreachable("should be lowered");
5094
5095       case nir_tex_src_texture_offset: {
5096          /* Emit code to evaluate the actual indexing expression */
5097          fs_reg tmp = vgrf(glsl_type::uint_type);
5098          bld.ADD(tmp, src, brw_imm_ud(texture));
5099          srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
5100          break;
5101       }
5102
5103       case nir_tex_src_sampler_offset: {
5104          /* Emit code to evaluate the actual indexing expression */
5105          fs_reg tmp = vgrf(glsl_type::uint_type);
5106          bld.ADD(tmp, src, brw_imm_ud(sampler));
5107          srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
5108          break;
5109       }
5110
5111       case nir_tex_src_ms_mcs:
5112          assert(instr->op == nir_texop_txf_ms);
5113          srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
5114          break;
5115
5116       case nir_tex_src_plane: {
5117          const uint32_t plane = nir_src_as_uint(instr->src[i].src);
5118          const uint32_t texture_index =
5119             instr->texture_index +
5120             stage_prog_data->binding_table.plane_start[plane] -
5121             stage_prog_data->binding_table.texture_start;
5122
5123          srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
5124          break;
5125       }
5126
5127       default:
5128          unreachable("unknown texture source");
5129       }
5130    }
5131
5132    if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
5133        (instr->op == nir_texop_txf_ms ||
5134         instr->op == nir_texop_samples_identical)) {
5135       if (devinfo->gen >= 7 &&
5136           key_tex->compressed_multisample_layout_mask & (1 << texture)) {
5137          srcs[TEX_LOGICAL_SRC_MCS] =
5138             emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
5139                            instr->coord_components,
5140                            srcs[TEX_LOGICAL_SRC_SURFACE]);
5141       } else {
5142          srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
5143       }
5144    }
5145
5146    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
5147    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
5148
5149    bool shader_supports_implicit_lod = stage == MESA_SHADER_FRAGMENT ||
5150       (stage == MESA_SHADER_COMPUTE &&
5151        nir->info.cs.derivative_group != DERIVATIVE_GROUP_NONE);
5152
5153    enum opcode opcode;
5154    switch (instr->op) {
5155    case nir_texop_tex:
5156       opcode = shader_supports_implicit_lod ?
5157          SHADER_OPCODE_TEX_LOGICAL : SHADER_OPCODE_TXL_LOGICAL;
5158       break;
5159    case nir_texop_txb:
5160       opcode = FS_OPCODE_TXB_LOGICAL;
5161       break;
5162    case nir_texop_txl:
5163       opcode = SHADER_OPCODE_TXL_LOGICAL;
5164       break;
5165    case nir_texop_txd:
5166       opcode = SHADER_OPCODE_TXD_LOGICAL;
5167       break;
5168    case nir_texop_txf:
5169       opcode = SHADER_OPCODE_TXF_LOGICAL;
5170       break;
5171    case nir_texop_txf_ms:
5172       if ((key_tex->msaa_16 & (1 << sampler)))
5173          opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
5174       else
5175          opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
5176       break;
5177    case nir_texop_txf_ms_mcs:
5178       opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
5179       break;
5180    case nir_texop_query_levels:
5181    case nir_texop_txs:
5182       opcode = SHADER_OPCODE_TXS_LOGICAL;
5183       break;
5184    case nir_texop_lod:
5185       opcode = SHADER_OPCODE_LOD_LOGICAL;
5186       break;
5187    case nir_texop_tg4:
5188       if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
5189          opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
5190       else
5191          opcode = SHADER_OPCODE_TG4_LOGICAL;
5192       break;
5193    case nir_texop_texture_samples:
5194       opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
5195       break;
5196    case nir_texop_samples_identical: {
5197       fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
5198
5199       /* If mcs is an immediate value, it means there is no MCS.  In that case
5200        * just return false.
5201        */
5202       if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
5203          bld.MOV(dst, brw_imm_ud(0u));
5204       } else if ((key_tex->msaa_16 & (1 << sampler))) {
5205          fs_reg tmp = vgrf(glsl_type::uint_type);
5206          bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
5207                 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
5208          bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
5209       } else {
5210          bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
5211                  BRW_CONDITIONAL_EQ);
5212       }
5213       return;
5214    }
5215    default:
5216       unreachable("unknown texture opcode");
5217    }
5218
5219    if (instr->op == nir_texop_tg4) {
5220       if (instr->component == 1 &&
5221           key_tex->gather_channel_quirk_mask & (1 << texture)) {
5222          /* gather4 sampler is broken for green channel on RG32F --
5223           * we must ask for blue instead.
5224           */
5225          header_bits |= 2 << 16;
5226       } else {
5227          header_bits |= instr->component << 16;
5228       }
5229    }
5230
5231    fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);
5232    fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
5233    inst->offset = header_bits;
5234
5235    const unsigned dest_size = nir_tex_instr_dest_size(instr);
5236    if (devinfo->gen >= 9 &&
5237        instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
5238       unsigned write_mask = instr->dest.is_ssa ?
5239                             nir_ssa_def_components_read(&instr->dest.ssa):
5240                             (1 << dest_size) - 1;
5241       assert(write_mask != 0); /* dead code should have been eliminated */
5242       inst->size_written = util_last_bit(write_mask) *
5243                            inst->dst.component_size(inst->exec_size);
5244    } else {
5245       inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
5246    }
5247
5248    if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
5249       inst->shadow_compare = true;
5250
5251    if (instr->op == nir_texop_tg4 && devinfo->gen == 6)
5252       emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);
5253
5254    fs_reg nir_dest[4];
5255    for (unsigned i = 0; i < dest_size; i++)
5256       nir_dest[i] = offset(dst, bld, i);
5257
5258    if (instr->op == nir_texop_query_levels) {
5259       /* # levels is in .w */
5260       nir_dest[0] = offset(dst, bld, 3);
5261    } else if (instr->op == nir_texop_txs &&
5262               dest_size >= 3 && devinfo->gen < 7) {
5263       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
5264       fs_reg depth = offset(dst, bld, 2);
5265       nir_dest[2] = vgrf(glsl_type::int_type);
5266       bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
5267    }
5268
5269    bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
5270 }
5271
5272 void
5273 fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
5274 {
5275    switch (instr->type) {
5276    case nir_jump_break:
5277       bld.emit(BRW_OPCODE_BREAK);
5278       break;
5279    case nir_jump_continue:
5280       bld.emit(BRW_OPCODE_CONTINUE);
5281       break;
5282    case nir_jump_return:
5283    default:
5284       unreachable("unknown jump");
5285    }
5286 }
5287
5288 /*
5289  * This helper takes a source register and un/shuffles it into the destination
5290  * register.
5291  *
5292  * If source type size is smaller than destination type size the operation
5293  * needed is a component shuffle. The opposite case would be an unshuffle. If
5294  * source/destination type size is equal a shuffle is done that would be
5295  * equivalent to a simple MOV.
5296  *
5297  * For example, if source is a 16-bit type and destination is 32-bit. A 3
5298  * components .xyz 16-bit vector on SIMD8 would be.
5299  *
5300  *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
5301  *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
5302  *
5303  * This helper will return the following 2 32-bit components with the 16-bit
5304  * values shuffled:
5305  *
5306  *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
5307  *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
5308  *
5309  * For unshuffle, the example would be the opposite, a 64-bit type source
5310  * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8
5311  * would be:
5312  *
5313  *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
5314  *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
5315  *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
5316  *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
5317  *
5318  * The returned result would be the following 4 32-bit components unshuffled:
5319  *
5320  *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
5321  *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
5322  *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
5323  *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
5324  *
5325  * - Source and destination register must not be overlapped.
5326  * - components units are measured in terms of the smaller type between
5327  *   source and destination because we are un/shuffling the smaller
5328  *   components from/into the bigger ones.
5329  * - first_component parameter allows skipping source components.
5330  */
5331 void
5332 shuffle_src_to_dst(const fs_builder &bld,
5333                    const fs_reg &dst,
5334                    const fs_reg &src,
5335                    uint32_t first_component,
5336                    uint32_t components)
5337 {
5338    if (type_sz(src.type) == type_sz(dst.type)) {
5339       assert(!regions_overlap(dst,
5340          type_sz(dst.type) * bld.dispatch_width() * components,
5341          offset(src, bld, first_component),
5342          type_sz(src.type) * bld.dispatch_width() * components));
5343       for (unsigned i = 0; i < components; i++) {
5344          bld.MOV(retype(offset(dst, bld, i), src.type),
5345                  offset(src, bld, i + first_component));
5346       }
5347    } else if (type_sz(src.type) < type_sz(dst.type)) {
5348       /* Source is shuffled into destination */
5349       unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
5350       assert(!regions_overlap(dst,
5351          type_sz(dst.type) * bld.dispatch_width() *
5352          DIV_ROUND_UP(components, size_ratio),
5353          offset(src, bld, first_component),
5354          type_sz(src.type) * bld.dispatch_width() * components));
5355
5356       brw_reg_type shuffle_type =
5357          brw_reg_type_from_bit_size(8 * type_sz(src.type),
5358                                     BRW_REGISTER_TYPE_D);
5359       for (unsigned i = 0; i < components; i++) {
5360          fs_reg shuffle_component_i =
5361             subscript(offset(dst, bld, i / size_ratio),
5362                       shuffle_type, i % size_ratio);
5363          bld.MOV(shuffle_component_i,
5364                  retype(offset(src, bld, i + first_component), shuffle_type));
5365       }
5366    } else {
5367       /* Source is unshuffled into destination */
5368       unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
5369       assert(!regions_overlap(dst,
5370          type_sz(dst.type) * bld.dispatch_width() * components,
5371          offset(src, bld, first_component / size_ratio),
5372          type_sz(src.type) * bld.dispatch_width() *
5373          DIV_ROUND_UP(components + (first_component % size_ratio),
5374                       size_ratio)));
5375
5376       brw_reg_type shuffle_type =
5377          brw_reg_type_from_bit_size(8 * type_sz(dst.type),
5378                                     BRW_REGISTER_TYPE_D);
5379       for (unsigned i = 0; i < components; i++) {
5380          fs_reg shuffle_component_i =
5381             subscript(offset(src, bld, (first_component + i) / size_ratio),
5382                       shuffle_type, (first_component + i) % size_ratio);
5383          bld.MOV(retype(offset(dst, bld, i), shuffle_type),
5384                  shuffle_component_i);
5385       }
5386    }
5387 }
5388
5389 void
5390 shuffle_from_32bit_read(const fs_builder &bld,
5391                         const fs_reg &dst,
5392                         const fs_reg &src,
5393                         uint32_t first_component,
5394                         uint32_t components)
5395 {
5396    assert(type_sz(src.type) == 4);
5397
5398    /* This function takes components in units of the destination type while
5399     * shuffle_src_to_dst takes components in units of the smallest type
5400     */
5401    if (type_sz(dst.type) > 4) {
5402       assert(type_sz(dst.type) == 8);
5403       first_component *= 2;
5404       components *= 2;
5405    }
5406
5407    shuffle_src_to_dst(bld, dst, src, first_component, components);
5408 }
5409
5410 fs_reg
5411 shuffle_for_32bit_write(const fs_builder &bld,
5412                         const fs_reg &src,
5413                         uint32_t first_component,
5414                         uint32_t components)
5415 {
5416    fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D,
5417                          DIV_ROUND_UP (components * type_sz(src.type), 4));
5418    /* This function takes components in units of the source type while
5419     * shuffle_src_to_dst takes components in units of the smallest type
5420     */
5421    if (type_sz(src.type) > 4) {
5422       assert(type_sz(src.type) == 8);
5423       first_component *= 2;
5424       components *= 2;
5425    }
5426
5427    shuffle_src_to_dst(bld, dst, src, first_component, components);
5428
5429    return dst;
5430 }
5431
5432 fs_reg
5433 setup_imm_df(const fs_builder &bld, double v)
5434 {
5435    const struct gen_device_info *devinfo = bld.shader->devinfo;
5436    assert(devinfo->gen >= 7);
5437
5438    if (devinfo->gen >= 8)
5439       return brw_imm_df(v);
5440
5441    /* gen7.5 does not support DF immediates straighforward but the DIM
5442     * instruction allows to set the 64-bit immediate value.
5443     */
5444    if (devinfo->is_haswell) {
5445       const fs_builder ubld = bld.exec_all().group(1, 0);
5446       fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
5447       ubld.DIM(dst, brw_imm_df(v));
5448       return component(dst, 0);
5449    }
5450
5451    /* gen7 does not support DF immediates, so we generate a 64-bit constant by
5452     * writing the low 32-bit of the constant to suboffset 0 of a VGRF and
5453     * the high 32-bit to suboffset 4 and then applying a stride of 0.
5454     *
5455     * Alternatively, we could also produce a normal VGRF (without stride 0)
5456     * by writing to all the channels in the VGRF, however, that would hit the
5457     * gen7 bug where we have to split writes that span more than 1 register
5458     * into instructions with a width of 4 (otherwise the write to the second
5459     * register written runs into an execmask hardware bug) which isn't very
5460     * nice.
5461     */
5462    union {
5463       double d;
5464       struct {
5465          uint32_t i1;
5466          uint32_t i2;
5467       };
5468    } di;
5469
5470    di.d = v;
5471
5472    const fs_builder ubld = bld.exec_all().group(1, 0);
5473    const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
5474    ubld.MOV(tmp, brw_imm_ud(di.i1));
5475    ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
5476
5477    return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
5478 }
5479
5480 fs_reg
5481 setup_imm_b(const fs_builder &bld, int8_t v)
5482 {
5483    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
5484    bld.MOV(tmp, brw_imm_w(v));
5485    return tmp;
5486 }
5487
5488 fs_reg
5489 setup_imm_ub(const fs_builder &bld, uint8_t v)
5490 {
5491    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
5492    bld.MOV(tmp, brw_imm_uw(v));
5493    return tmp;
5494 }