src/intel/compiler/brw_fs_nir.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "compiler/glsl/ir.h"
  25 #include "brw_fs.h"
  26 #include "brw_fs_surface_builder.h"
  27 #include "brw_nir.h"
  28
  29 using namespace brw;
  30 using namespace brw::surface_access;
  31
  32 void
  33 fs_visitor::emit_nir_code()
  34 {
  35    /* emit the arrays used for inputs and outputs - load/store intrinsics will
  36     * be converted to reads/writes of these arrays
  37     */
  38    nir_setup_outputs();
  39    nir_setup_uniforms();
  40    nir_emit_system_values();
  41
  42    /* get the main function and emit it */
  43    nir_foreach_function(function, nir) {
  44       assert(strcmp(function->name, "main") == 0);
  45       assert(function->impl);
  46       nir_emit_impl(function->impl);
  47    }
  48 }
  49
  50 void
  51 fs_visitor::nir_setup_outputs()
  52 {
  53    if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
  54       return;
  55
  56    nir_foreach_variable(var, &nir->outputs) {
  57       const unsigned vec4s =
  58          var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
  59                            : type_size_vec4(var->type);
  60       fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * vec4s);
  61       for (unsigned i = 0; i < vec4s; i++) {
  62          if (outputs[var->data.driver_location + i].file == BAD_FILE)
  63             outputs[var->data.driver_location + i] = offset(reg, bld, 4 * i);
  64       }
  65    }
  66 }
  67
  68 void
  69 fs_visitor::nir_setup_uniforms()
  70 {
  71    if (dispatch_width != min_dispatch_width)
  72       return;
  73
  74    uniforms = nir->num_uniforms / 4;
  75 }
  76
  77 static bool
  78 emit_system_values_block(nir_block *block, fs_visitor *v)
  79 {
  80    fs_reg *reg;
  81
  82    nir_foreach_instr(instr, block) {
  83       if (instr->type != nir_instr_type_intrinsic)
  84          continue;
  85
  86       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
  87       switch (intrin->intrinsic) {
  88       case nir_intrinsic_load_vertex_id:
  89          unreachable("should be lowered by lower_vertex_id().");
  90
  91       case nir_intrinsic_load_vertex_id_zero_base:
  92          assert(v->stage == MESA_SHADER_VERTEX);
  93          reg = &v->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
  94          if (reg->file == BAD_FILE)
  95             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
  96          break;
  97
  98       case nir_intrinsic_load_base_vertex:
  99          assert(v->stage == MESA_SHADER_VERTEX);
 100          reg = &v->nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
 101          if (reg->file == BAD_FILE)
 102             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_VERTEX);
 103          break;
 104
 105       case nir_intrinsic_load_instance_id:
 106          assert(v->stage == MESA_SHADER_VERTEX);
 107          reg = &v->nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
 108          if (reg->file == BAD_FILE)
 109             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
 110          break;
 111
 112       case nir_intrinsic_load_base_instance:
 113          assert(v->stage == MESA_SHADER_VERTEX);
 114          reg = &v->nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
 115          if (reg->file == BAD_FILE)
 116             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_INSTANCE);
 117          break;
 118
 119       case nir_intrinsic_load_draw_id:
 120          assert(v->stage == MESA_SHADER_VERTEX);
 121          reg = &v->nir_system_values[SYSTEM_VALUE_DRAW_ID];
 122          if (reg->file == BAD_FILE)
 123             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_DRAW_ID);
 124          break;
 125
 126       case nir_intrinsic_load_invocation_id:
 127          if (v->stage == MESA_SHADER_TESS_CTRL)
 128             break;
 129          assert(v->stage == MESA_SHADER_GEOMETRY);
 130          reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
 131          if (reg->file == BAD_FILE) {
 132             const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
 133             fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 134             fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
 135             abld.SHR(iid, g1, brw_imm_ud(27u));
 136             *reg = iid;
 137          }
 138          break;
 139
 140       case nir_intrinsic_load_sample_pos:
 141          assert(v->stage == MESA_SHADER_FRAGMENT);
 142          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
 143          if (reg->file == BAD_FILE)
 144             *reg = *v->emit_samplepos_setup();
 145          break;
 146
 147       case nir_intrinsic_load_sample_id:
 148          assert(v->stage == MESA_SHADER_FRAGMENT);
 149          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
 150          if (reg->file == BAD_FILE)
 151             *reg = *v->emit_sampleid_setup();
 152          break;
 153
 154       case nir_intrinsic_load_sample_mask_in:
 155          assert(v->stage == MESA_SHADER_FRAGMENT);
 156          assert(v->devinfo->gen >= 7);
 157          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
 158          if (reg->file == BAD_FILE)
 159             *reg = *v->emit_samplemaskin_setup();
 160          break;
 161
 162       case nir_intrinsic_load_work_group_id:
 163          assert(v->stage == MESA_SHADER_COMPUTE);
 164          reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
 165          if (reg->file == BAD_FILE)
 166             *reg = *v->emit_cs_work_group_id_setup();
 167          break;
 168
 169       case nir_intrinsic_load_helper_invocation:
 170          assert(v->stage == MESA_SHADER_FRAGMENT);
 171          reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
 172          if (reg->file == BAD_FILE) {
 173             const fs_builder abld =
 174                v->bld.annotate("gl_HelperInvocation", NULL);
 175
 176             /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
 177              * pixel mask is in g1.7 of the thread payload.
 178              *
 179              * We move the per-channel pixel enable bit to the low bit of each
 180              * channel by shifting the byte containing the pixel mask by the
 181              * vector immediate 0x76543210UV.
 182              *
 183              * The region of <1,8,0> reads only 1 byte (the pixel masks for
 184              * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
 185              * masks for 2 and 3) in SIMD16.
 186              */
 187             fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
 188             abld.SHR(shifted,
 189                      stride(byte_offset(retype(brw_vec1_grf(1, 0),
 190                                                BRW_REGISTER_TYPE_UB), 28),
 191                             1, 8, 0),
 192                      brw_imm_v(0x76543210));
 193
 194             /* A set bit in the pixel mask means the channel is enabled, but
 195              * that is the opposite of gl_HelperInvocation so we need to invert
 196              * the mask.
 197              *
 198              * The negate source-modifier bit of logical instructions on Gen8+
 199              * performs 1's complement negation, so we can use that instead of
 200              * a NOT instruction.
 201              */
 202             fs_reg inverted = negate(shifted);
 203             if (v->devinfo->gen < 8) {
 204                inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
 205                abld.NOT(inverted, shifted);
 206             }
 207
 208             /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
 209              * with 1 and negating.
 210              */
 211             fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
 212             abld.AND(anded, inverted, brw_imm_uw(1));
 213
 214             fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
 215             abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
 216             *reg = dst;
 217          }
 218          break;
 219
 220       default:
 221          break;
 222       }
 223    }
 224
 225    return true;
 226 }
 227
 228 void
 229 fs_visitor::nir_emit_system_values()
 230 {
 231    nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
 232    for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
 233       nir_system_values[i] = fs_reg();
 234    }
 235
 236    nir_foreach_function(function, nir) {
 237       assert(strcmp(function->name, "main") == 0);
 238       assert(function->impl);
 239       nir_foreach_block(block, function->impl) {
 240          emit_system_values_block(block, this);
 241       }
 242    }
 243 }
 244
 245 void
 246 fs_visitor::nir_emit_impl(nir_function_impl *impl)
 247 {
 248    nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
 249    for (unsigned i = 0; i < impl->reg_alloc; i++) {
 250       nir_locals[i] = fs_reg();
 251    }
 252
 253    foreach_list_typed(nir_register, reg, node, &impl->registers) {
 254       unsigned array_elems =
 255          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
 256       unsigned size = array_elems * reg->num_components;
 257       const brw_reg_type reg_type =
 258          reg->bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
 259       nir_locals[reg->index] = bld.vgrf(reg_type, size);
 260    }
 261
 262    nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
 263                              impl->ssa_alloc);
 264
 265    nir_emit_cf_list(&impl->body);
 266 }
 267
 268 void
 269 fs_visitor::nir_emit_cf_list(exec_list *list)
 270 {
 271    exec_list_validate(list);
 272    foreach_list_typed(nir_cf_node, node, node, list) {
 273       switch (node->type) {
 274       case nir_cf_node_if:
 275          nir_emit_if(nir_cf_node_as_if(node));
 276          break;
 277
 278       case nir_cf_node_loop:
 279          nir_emit_loop(nir_cf_node_as_loop(node));
 280          break;
 281
 282       case nir_cf_node_block:
 283          nir_emit_block(nir_cf_node_as_block(node));
 284          break;
 285
 286       default:
 287          unreachable("Invalid CFG node block");
 288       }
 289    }
 290 }
 291
 292 void
 293 fs_visitor::nir_emit_if(nir_if *if_stmt)
 294 {
 295    /* first, put the condition into f0 */
 296    fs_inst *inst = bld.MOV(bld.null_reg_d(),
 297                             retype(get_nir_src(if_stmt->condition),
 298                                    BRW_REGISTER_TYPE_D));
 299    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 300
 301    bld.IF(BRW_PREDICATE_NORMAL);
 302
 303    nir_emit_cf_list(&if_stmt->then_list);
 304
 305    /* note: if the else is empty, dead CF elimination will remove it */
 306    bld.emit(BRW_OPCODE_ELSE);
 307
 308    nir_emit_cf_list(&if_stmt->else_list);
 309
 310    bld.emit(BRW_OPCODE_ENDIF);
 311 }
 312
 313 void
 314 fs_visitor::nir_emit_loop(nir_loop *loop)
 315 {
 316    bld.emit(BRW_OPCODE_DO);
 317
 318    nir_emit_cf_list(&loop->body);
 319
 320    bld.emit(BRW_OPCODE_WHILE);
 321 }
 322
 323 void
 324 fs_visitor::nir_emit_block(nir_block *block)
 325 {
 326    nir_foreach_instr(instr, block) {
 327       nir_emit_instr(instr);
 328    }
 329 }
 330
 331 void
 332 fs_visitor::nir_emit_instr(nir_instr *instr)
 333 {
 334    const fs_builder abld = bld.annotate(NULL, instr);
 335
 336    switch (instr->type) {
 337    case nir_instr_type_alu:
 338       nir_emit_alu(abld, nir_instr_as_alu(instr));
 339       break;
 340
 341    case nir_instr_type_intrinsic:
 342       switch (stage) {
 343       case MESA_SHADER_VERTEX:
 344          nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 345          break;
 346       case MESA_SHADER_TESS_CTRL:
 347          nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 348          break;
 349       case MESA_SHADER_TESS_EVAL:
 350          nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
 351          break;
 352       case MESA_SHADER_GEOMETRY:
 353          nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 354          break;
 355       case MESA_SHADER_FRAGMENT:
 356          nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 357          break;
 358       case MESA_SHADER_COMPUTE:
 359          nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 360          break;
 361       default:
 362          unreachable("unsupported shader stage");
 363       }
 364       break;
 365
 366    case nir_instr_type_tex:
 367       nir_emit_texture(abld, nir_instr_as_tex(instr));
 368       break;
 369
 370    case nir_instr_type_load_const:
 371       nir_emit_load_const(abld, nir_instr_as_load_const(instr));
 372       break;
 373
 374    case nir_instr_type_ssa_undef:
 375       /* We create a new VGRF for undefs on every use (by handling
 376        * them in get_nir_src()), rather than for each definition.
 377        * This helps register coalescing eliminate MOVs from undef.
 378        */
 379       break;
 380
 381    case nir_instr_type_jump:
 382       nir_emit_jump(abld, nir_instr_as_jump(instr));
 383       break;
 384
 385    default:
 386       unreachable("unknown instruction type");
 387    }
 388 }
 389
 390 /**
 391  * Recognizes a parent instruction of nir_op_extract_* and changes the type to
 392  * match instr.
 393  */
 394 bool
 395 fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
 396                                       const fs_reg &result)
 397 {
 398    if (!instr->src[0].src.is_ssa ||
 399        !instr->src[0].src.ssa->parent_instr)
 400       return false;
 401
 402    if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
 403       return false;
 404
 405    nir_alu_instr *src0 =
 406       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
 407
 408    if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
 409        src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
 410       return false;
 411
 412    nir_const_value *element = nir_src_as_const_value(src0->src[1].src);
 413    assert(element != NULL);
 414
 415    /* Element type to extract.*/
 416    const brw_reg_type type = brw_int_type(
 417       src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
 418       src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
 419
 420    fs_reg op0 = get_nir_src(src0->src[0].src);
 421    op0.type = brw_type_for_nir_type(devinfo,
 422       (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
 423                      nir_src_bit_size(src0->src[0].src)));
 424    op0 = offset(op0, bld, src0->src[0].swizzle[0]);
 425
 426    set_saturate(instr->dest.saturate,
 427                 bld.MOV(result, subscript(op0, type, element->u32[0])));
 428    return true;
 429 }
 430
 431 bool
 432 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
 433                                          const fs_reg &result)
 434 {
 435    if (!instr->src[0].src.is_ssa ||
 436        instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
 437       return false;
 438
 439    nir_intrinsic_instr *src0 =
 440       nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
 441
 442    if (src0->intrinsic != nir_intrinsic_load_front_face)
 443       return false;
 444
 445    nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
 446    if (!value1 || fabsf(value1->f32[0]) != 1.0f)
 447       return false;
 448
 449    nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src);
 450    if (!value2 || fabsf(value2->f32[0]) != 1.0f)
 451       return false;
 452
 453    fs_reg tmp = vgrf(glsl_type::int_type);
 454
 455    if (devinfo->gen >= 6) {
 456       /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
 457       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
 458
 459       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 460        *
 461        *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
 462        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
 463        *
 464        * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
 465        *
 466        * This negation looks like it's safe in practice, because bits 0:4 will
 467        * surely be TRIANGLES
 468        */
 469
 470       if (value1->f32[0] == -1.0f) {
 471          g0.negate = true;
 472       }
 473
 474       bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
 475              g0, brw_imm_uw(0x3f80));
 476    } else {
 477       /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
 478       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
 479
 480       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 481        *
 482        *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
 483        *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
 484        *
 485        * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
 486        *
 487        * This negation looks like it's safe in practice, because bits 0:4 will
 488        * surely be TRIANGLES
 489        */
 490
 491       if (value1->f32[0] == -1.0f) {
 492          g1_6.negate = true;
 493       }
 494
 495       bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
 496    }
 497    bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
 498
 499    return true;
 500 }
 501
 502 static void
 503 emit_find_msb_using_lzd(const fs_builder &bld,
 504                         const fs_reg &result,
 505                         const fs_reg &src,
 506                         bool is_signed)
 507 {
 508    fs_inst *inst;
 509    fs_reg temp = src;
 510
 511    if (is_signed) {
 512       /* LZD of an absolute value source almost always does the right
 513        * thing.  There are two problem values:
 514        *
 515        * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
 516        *   0.  However, findMSB(int(0x80000000)) == 30.
 517        *
 518        * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
 519        *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
 520        *
 521        *    For a value of zero or negative one, -1 will be returned.
 522        *
 523        * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
 524        *   findMSB(-(1<<x)) should return x-1.
 525        *
 526        * For all negative number cases, including 0x80000000 and
 527        * 0xffffffff, the correct value is obtained from LZD if instead of
 528        * negating the (already negative) value the logical-not is used.  A
 529        * conditonal logical-not can be achieved in two instructions.
 530        */
 531       temp = bld.vgrf(BRW_REGISTER_TYPE_D);
 532
 533       bld.ASR(temp, src, brw_imm_d(31));
 534       bld.XOR(temp, temp, src);
 535    }
 536
 537    bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
 538            retype(temp, BRW_REGISTER_TYPE_UD));
 539
 540    /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
 541     * from the LSB side. Subtract the result from 31 to convert the MSB
 542     * count into an LSB count.  If no bits are set, LZD will return 32.
 543     * 31-32 = -1, which is exactly what findMSB() is supposed to return.
 544     */
 545    inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
 546    inst->src[0].negate = true;
 547 }
 548
 549 void
 550 fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
 551 {
 552    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
 553    fs_inst *inst;
 554
 555    fs_reg result = get_nir_dest(instr->dest.dest);
 556    result.type = brw_type_for_nir_type(devinfo,
 557       (nir_alu_type)(nir_op_infos[instr->op].output_type |
 558                      nir_dest_bit_size(instr->dest.dest)));
 559
 560    fs_reg op[4];
 561    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 562       op[i] = get_nir_src(instr->src[i].src);
 563       op[i].type = brw_type_for_nir_type(devinfo,
 564          (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
 565                         nir_src_bit_size(instr->src[i].src)));
 566       op[i].abs = instr->src[i].abs;
 567       op[i].negate = instr->src[i].negate;
 568    }
 569
 570    /* We get a bunch of mov's out of the from_ssa pass and they may still
 571     * be vectorized.  We'll handle them as a special-case.  We'll also
 572     * handle vecN here because it's basically the same thing.
 573     */
 574    switch (instr->op) {
 575    case nir_op_imov:
 576    case nir_op_fmov:
 577    case nir_op_vec2:
 578    case nir_op_vec3:
 579    case nir_op_vec4: {
 580       fs_reg temp = result;
 581       bool need_extra_copy = false;
 582       for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 583          if (!instr->src[i].src.is_ssa &&
 584              instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
 585             need_extra_copy = true;
 586             temp = bld.vgrf(result.type, 4);
 587             break;
 588          }
 589       }
 590
 591       for (unsigned i = 0; i < 4; i++) {
 592          if (!(instr->dest.write_mask & (1 << i)))
 593             continue;
 594
 595          if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
 596             inst = bld.MOV(offset(temp, bld, i),
 597                            offset(op[0], bld, instr->src[0].swizzle[i]));
 598          } else {
 599             inst = bld.MOV(offset(temp, bld, i),
 600                            offset(op[i], bld, instr->src[i].swizzle[0]));
 601          }
 602          inst->saturate = instr->dest.saturate;
 603       }
 604
 605       /* In this case the source and destination registers were the same,
 606        * so we need to insert an extra set of moves in order to deal with
 607        * any swizzling.
 608        */
 609       if (need_extra_copy) {
 610          for (unsigned i = 0; i < 4; i++) {
 611             if (!(instr->dest.write_mask & (1 << i)))
 612                continue;
 613
 614             bld.MOV(offset(result, bld, i), offset(temp, bld, i));
 615          }
 616       }
 617       return;
 618    }
 619    default:
 620       break;
 621    }
 622
 623    /* At this point, we have dealt with any instruction that operates on
 624     * more than a single channel.  Therefore, we can just adjust the source
 625     * and destination registers for that channel and emit the instruction.
 626     */
 627    unsigned channel = 0;
 628    if (nir_op_infos[instr->op].output_size == 0) {
 629       /* Since NIR is doing the scalarizing for us, we should only ever see
 630        * vectorized operations with a single channel.
 631        */
 632       assert(_mesa_bitcount(instr->dest.write_mask) == 1);
 633       channel = ffs(instr->dest.write_mask) - 1;
 634
 635       result = offset(result, bld, channel);
 636    }
 637
 638    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 639       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
 640       op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
 641    }
 642
 643    switch (instr->op) {
 644    case nir_op_i2f:
 645    case nir_op_u2f:
 646       if (optimize_extract_to_float(instr, result))
 647          return;
 648       inst = bld.MOV(result, op[0]);
 649       inst->saturate = instr->dest.saturate;
 650       break;
 651
 652    case nir_op_f2d:
 653    case nir_op_i2d:
 654    case nir_op_u2d:
 655       /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
 656        *
 657        *    "When source or destination is 64b (...), regioning in Align1
 658        *     must follow these rules:
 659        *
 660        *     1. Source and destination horizontal stride must be aligned to
 661        *        the same qword.
 662        *     (...)"
 663        *
 664        * This means that 32-bit to 64-bit conversions need to have the 32-bit
 665        * data elements aligned to 64-bit. This restriction does not apply to
 666        * BDW and later.
 667        */
 668       if (nir_dest_bit_size(instr->dest.dest) == 64 &&
 669           nir_src_bit_size(instr->src[0].src) == 32 &&
 670           (devinfo->is_cherryview || devinfo->is_broxton)) {
 671          fs_reg tmp = bld.vgrf(result.type, 1);
 672          tmp = subscript(tmp, op[0].type, 0);
 673          inst = bld.MOV(tmp, op[0]);
 674          inst = bld.MOV(result, tmp);
 675          inst->saturate = instr->dest.saturate;
 676          break;
 677       }
 678       /* fallthrough */
 679    case nir_op_i642d:
 680    case nir_op_u642d:
 681    case nir_op_f2i64:
 682    case nir_op_f2u64:
 683    case nir_op_i2i64:
 684    case nir_op_i2u64:
 685    case nir_op_u2i64:
 686    case nir_op_u2u64:
 687    case nir_op_d2f:
 688    case nir_op_d2i:
 689    case nir_op_d2u:
 690    case nir_op_i642f:
 691    case nir_op_u642f:
 692    case nir_op_u2i32:
 693    case nir_op_i2i32:
 694    case nir_op_u2u32:
 695    case nir_op_i2u32:
 696    case nir_op_f2i:
 697    case nir_op_f2u:
 698       inst = bld.MOV(result, op[0]);
 699       inst->saturate = instr->dest.saturate;
 700       break;
 701
 702    case nir_op_fsign: {
 703       if (op[0].abs) {
 704          /* Straightforward since the source can be assumed to be
 705           * non-negative.
 706           */
 707          set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0]));
 708          set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(result, brw_imm_f(1.0f)));
 709
 710       } else if (type_sz(op[0].type) < 8) {
 711          /* AND(val, 0x80000000) gives the sign bit.
 712           *
 713           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
 714           * zero.
 715           */
 716          bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
 717
 718          fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
 719          op[0].type = BRW_REGISTER_TYPE_UD;
 720          result.type = BRW_REGISTER_TYPE_UD;
 721          bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
 722
 723          inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
 724          inst->predicate = BRW_PREDICATE_NORMAL;
 725          if (instr->dest.saturate) {
 726             inst = bld.MOV(result, result);
 727             inst->saturate = true;
 728          }
 729       } else {
 730          /* For doubles we do the same but we need to consider:
 731           *
 732           * - 2-src instructions can't operate with 64-bit immediates
 733           * - The sign is encoded in the high 32-bit of each DF
 734           * - CMP with DF requires special handling in SIMD16
 735           * - We need to produce a DF result.
 736           */
 737
 738          /* 2-src instructions can't have 64-bit immediates, so put 0.0 in
 739           * a register and compare with that.
 740           */
 741          fs_reg tmp = vgrf(glsl_type::double_type);
 742          bld.MOV(tmp, setup_imm_df(bld, 0.0));
 743
 744          /* A direct DF CMP using the flag register (null dst) won't work in
 745           * SIMD16 because the CMP will be split in two by lower_simd_width,
 746           * resulting in two CMP instructions with the same dst (NULL),
 747           * leading to dead code elimination of the first one. In SIMD8,
 748           * however, there is no need to split the CMP and we can save some
 749           * work.
 750           */
 751          fs_reg dst_tmp = vgrf(glsl_type::double_type);
 752          bld.CMP(dst_tmp, op[0], tmp, BRW_CONDITIONAL_NZ);
 753
 754          /* In SIMD16 we want to avoid using a NULL dst register with DF CMP,
 755           * so we store the result of the comparison in a vgrf instead and
 756           * then we generate a UD comparison from that that won't have to
 757           * be split by lower_simd_width. This is what NIR does to handle
 758           * double comparisons in the general case.
 759           */
 760          if (bld.dispatch_width() == 16 ) {
 761             fs_reg dst_tmp_ud = retype(dst_tmp, BRW_REGISTER_TYPE_UD);
 762             bld.MOV(dst_tmp_ud, subscript(dst_tmp, BRW_REGISTER_TYPE_UD, 0));
 763             bld.CMP(bld.null_reg_ud(),
 764                     dst_tmp_ud, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
 765          }
 766
 767          /* Get the high 32-bit of each double component where the sign is */
 768          fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
 769          bld.MOV(result_int, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
 770
 771          /* Get the sign bit */
 772          bld.AND(result_int, result_int, brw_imm_ud(0x80000000u));
 773
 774          /* Add 1.0 to the sign, predicated to skip the case of op[0] == 0.0 */
 775          inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
 776          inst->predicate = BRW_PREDICATE_NORMAL;
 777
 778          /* Convert from 32-bit float to 64-bit double */
 779          result.type = BRW_REGISTER_TYPE_DF;
 780          inst = bld.MOV(result, retype(result_int, BRW_REGISTER_TYPE_F));
 781
 782          if (instr->dest.saturate) {
 783             inst = bld.MOV(result, result);
 784             inst->saturate = true;
 785          }
 786       }
 787       break;
 788    }
 789
 790    case nir_op_isign:
 791       /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
 792        *               -> non-negative val generates 0x00000000.
 793        *  Predicated OR sets 1 if val is positive.
 794        */
 795       assert(nir_dest_bit_size(instr->dest.dest) < 64);
 796       bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G);
 797       bld.ASR(result, op[0], brw_imm_d(31));
 798       inst = bld.OR(result, result, brw_imm_d(1));
 799       inst->predicate = BRW_PREDICATE_NORMAL;
 800       break;
 801
 802    case nir_op_frcp:
 803       inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
 804       inst->saturate = instr->dest.saturate;
 805       break;
 806
 807    case nir_op_fexp2:
 808       inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
 809       inst->saturate = instr->dest.saturate;
 810       break;
 811
 812    case nir_op_flog2:
 813       inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
 814       inst->saturate = instr->dest.saturate;
 815       break;
 816
 817    case nir_op_fsin:
 818       inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
 819       inst->saturate = instr->dest.saturate;
 820       break;
 821
 822    case nir_op_fcos:
 823       inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
 824       inst->saturate = instr->dest.saturate;
 825       break;
 826
 827    case nir_op_fddx:
 828       if (fs_key->high_quality_derivatives) {
 829          inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
 830       } else {
 831          inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
 832       }
 833       inst->saturate = instr->dest.saturate;
 834       break;
 835    case nir_op_fddx_fine:
 836       inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
 837       inst->saturate = instr->dest.saturate;
 838       break;
 839    case nir_op_fddx_coarse:
 840       inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
 841       inst->saturate = instr->dest.saturate;
 842       break;
 843    case nir_op_fddy:
 844       if (fs_key->high_quality_derivatives) {
 845          inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
 846       } else {
 847          inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
 848       }
 849       inst->saturate = instr->dest.saturate;
 850       break;
 851    case nir_op_fddy_fine:
 852       inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
 853       inst->saturate = instr->dest.saturate;
 854       break;
 855    case nir_op_fddy_coarse:
 856       inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
 857       inst->saturate = instr->dest.saturate;
 858       break;
 859
 860    case nir_op_iadd:
 861    case nir_op_fadd:
 862       inst = bld.ADD(result, op[0], op[1]);
 863       inst->saturate = instr->dest.saturate;
 864       break;
 865
 866    case nir_op_fmul:
 867       inst = bld.MUL(result, op[0], op[1]);
 868       inst->saturate = instr->dest.saturate;
 869       break;
 870
 871    case nir_op_imul:
 872       assert(nir_dest_bit_size(instr->dest.dest) < 64);
 873       bld.MUL(result, op[0], op[1]);
 874       break;
 875
 876    case nir_op_imul_high:
 877    case nir_op_umul_high:
 878       assert(nir_dest_bit_size(instr->dest.dest) < 64);
 879       bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
 880       break;
 881
 882    case nir_op_idiv:
 883    case nir_op_udiv:
 884       assert(nir_dest_bit_size(instr->dest.dest) < 64);
 885       bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
 886       break;
 887
 888    case nir_op_uadd_carry:
 889       unreachable("Should have been lowered by carry_to_arith().");
 890
 891    case nir_op_usub_borrow:
 892       unreachable("Should have been lowered by borrow_to_arith().");
 893
 894    case nir_op_umod:
 895    case nir_op_irem:
 896       /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
 897        * appears that our hardware just does the right thing for signed
 898        * remainder.
 899        */
 900       assert(nir_dest_bit_size(instr->dest.dest) < 64);
 901       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
 902       break;
 903
 904    case nir_op_imod: {
 905       /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
 906       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
 907
 908       /* Math instructions don't support conditional mod */
 909       inst = bld.MOV(bld.null_reg_d(), result);
 910       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 911
 912       /* Now, we need to determine if signs of the sources are different.
 913        * When we XOR the sources, the top bit is 0 if they are the same and 1
 914        * if they are different.  We can then use a conditional modifier to
 915        * turn that into a predicate.  This leads us to an XOR.l instruction.
 916        *
 917        * Technically, according to the PRM, you're not allowed to use .l on a
       * XOR instruction.  However, empirical experiments and Curro's reading
 919        * of the simulator source both indicate that it's safe.
 920        */
 921       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
 922       inst = bld.XOR(tmp, op[0], op[1]);
 923       inst->predicate = BRW_PREDICATE_NORMAL;
 924       inst->conditional_mod = BRW_CONDITIONAL_L;
 925
 926       /* If the result of the initial remainder operation is non-zero and the
 927        * two sources have different signs, add in a copy of op[1] to get the
 928        * final integer modulus value.
 929        */
 930       inst = bld.ADD(result, result, op[1]);
 931       inst->predicate = BRW_PREDICATE_NORMAL;
 932       break;
 933    }
 934
 935    case nir_op_flt:
 936    case nir_op_fge:
 937    case nir_op_feq:
 938    case nir_op_fne: {
 939       fs_reg dest = result;
 940       if (nir_src_bit_size(instr->src[0].src) > 32) {
 941          dest = bld.vgrf(BRW_REGISTER_TYPE_DF, 1);
 942       }
 943       brw_conditional_mod cond;
 944       switch (instr->op) {
 945       case nir_op_flt:
 946          cond = BRW_CONDITIONAL_L;
 947          break;
 948       case nir_op_fge:
 949          cond = BRW_CONDITIONAL_GE;
 950          break;
 951       case nir_op_feq:
 952          cond = BRW_CONDITIONAL_Z;
 953          break;
 954       case nir_op_fne:
 955          cond = BRW_CONDITIONAL_NZ;
 956          break;
 957       default:
 958          unreachable("bad opcode");
 959       }
 960       bld.CMP(dest, op[0], op[1], cond);
 961       if (nir_src_bit_size(instr->src[0].src) > 32) {
 962          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
 963       }
 964       break;
 965    }
 966
 967    case nir_op_ilt:
 968    case nir_op_ult:
 969    case nir_op_ige:
 970    case nir_op_uge:
 971    case nir_op_ieq:
 972    case nir_op_ine: {
 973       fs_reg dest = result;
 974       if (nir_src_bit_size(instr->src[0].src) > 32) {
 975          dest = bld.vgrf(BRW_REGISTER_TYPE_UQ, 1);
 976       }
 977
 978       brw_conditional_mod cond;
 979       switch (instr->op) {
 980       case nir_op_ilt:
 981       case nir_op_ult:
 982          cond = BRW_CONDITIONAL_L;
 983          break;
 984       case nir_op_ige:
 985       case nir_op_uge:
 986          cond = BRW_CONDITIONAL_GE;
 987          break;
 988       case nir_op_ieq:
 989          cond = BRW_CONDITIONAL_Z;
 990          break;
 991       case nir_op_ine:
 992          cond = BRW_CONDITIONAL_NZ;
 993          break;
 994       default:
 995          unreachable("bad opcode");
 996       }
 997       bld.CMP(dest, op[0], op[1], cond);
 998       if (nir_src_bit_size(instr->src[0].src) > 32) {
 999          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1000       }
1001       break;
1002    }
1003
1004    case nir_op_inot:
1005       if (devinfo->gen >= 8) {
1006          op[0] = resolve_source_modifiers(op[0]);
1007       }
1008       bld.NOT(result, op[0]);
1009       break;
1010    case nir_op_ixor:
1011       if (devinfo->gen >= 8) {
1012          op[0] = resolve_source_modifiers(op[0]);
1013          op[1] = resolve_source_modifiers(op[1]);
1014       }
1015       bld.XOR(result, op[0], op[1]);
1016       break;
1017    case nir_op_ior:
1018       if (devinfo->gen >= 8) {
1019          op[0] = resolve_source_modifiers(op[0]);
1020          op[1] = resolve_source_modifiers(op[1]);
1021       }
1022       bld.OR(result, op[0], op[1]);
1023       break;
1024    case nir_op_iand:
1025       if (devinfo->gen >= 8) {
1026          op[0] = resolve_source_modifiers(op[0]);
1027          op[1] = resolve_source_modifiers(op[1]);
1028       }
1029       bld.AND(result, op[0], op[1]);
1030       break;
1031
1032    case nir_op_fdot2:
1033    case nir_op_fdot3:
1034    case nir_op_fdot4:
1035    case nir_op_ball_fequal2:
1036    case nir_op_ball_iequal2:
1037    case nir_op_ball_fequal3:
1038    case nir_op_ball_iequal3:
1039    case nir_op_ball_fequal4:
1040    case nir_op_ball_iequal4:
1041    case nir_op_bany_fnequal2:
1042    case nir_op_bany_inequal2:
1043    case nir_op_bany_fnequal3:
1044    case nir_op_bany_inequal3:
1045    case nir_op_bany_fnequal4:
1046    case nir_op_bany_inequal4:
1047       unreachable("Lowered by nir_lower_alu_reductions");
1048
1049    case nir_op_fnoise1_1:
1050    case nir_op_fnoise1_2:
1051    case nir_op_fnoise1_3:
1052    case nir_op_fnoise1_4:
1053    case nir_op_fnoise2_1:
1054    case nir_op_fnoise2_2:
1055    case nir_op_fnoise2_3:
1056    case nir_op_fnoise2_4:
1057    case nir_op_fnoise3_1:
1058    case nir_op_fnoise3_2:
1059    case nir_op_fnoise3_3:
1060    case nir_op_fnoise3_4:
1061    case nir_op_fnoise4_1:
1062    case nir_op_fnoise4_2:
1063    case nir_op_fnoise4_3:
1064    case nir_op_fnoise4_4:
1065       unreachable("not reached: should be handled by lower_noise");
1066
1067    case nir_op_ldexp:
1068       unreachable("not reached: should be handled by ldexp_to_arith()");
1069
1070    case nir_op_fsqrt:
1071       inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
1072       inst->saturate = instr->dest.saturate;
1073       break;
1074
1075    case nir_op_frsq:
1076       inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
1077       inst->saturate = instr->dest.saturate;
1078       break;
1079
1080    case nir_op_b2i64:
1081    case nir_op_b2i:
1082    case nir_op_b2f:
1083       bld.MOV(result, negate(op[0]));
1084       break;
1085
1086    case nir_op_i2b:
1087    case nir_op_f2b:
1088    case nir_op_i642b:
1089    case nir_op_d2b:
1090       if (nir_src_bit_size(instr->src[0].src) == 64) {
1091          /* two-argument instructions can't take 64-bit immediates */
1092          fs_reg zero;
1093          fs_reg tmp;
1094
1095          if (instr->op == nir_op_d2b) {
1096             zero = vgrf(glsl_type::double_type);
1097             tmp = vgrf(glsl_type::double_type);
1098          } else {
1099             zero = vgrf(glsl_type::int64_t_type);
1100             tmp = vgrf(glsl_type::int64_t_type);
1101          }
1102
1103          bld.MOV(zero, setup_imm_df(bld, 0.0));
1104          /* A SIMD16 execution needs to be split in two instructions, so use
1105           * a vgrf instead of the flag register as dst so instruction splitting
1106           * works
1107           */
1108          bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
1109          bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
1110       } else {
1111          if (instr->op == nir_op_f2b) {
1112             bld.CMP(result, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
1113          } else {
1114             bld.CMP(result, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1115          }
1116       }
1117       break;
1118
1119    case nir_op_ftrunc:
1120       inst = bld.RNDZ(result, op[0]);
1121       inst->saturate = instr->dest.saturate;
1122       break;
1123
1124    case nir_op_fceil: {
1125       op[0].negate = !op[0].negate;
1126       fs_reg temp = vgrf(glsl_type::float_type);
1127       bld.RNDD(temp, op[0]);
1128       temp.negate = true;
1129       inst = bld.MOV(result, temp);
1130       inst->saturate = instr->dest.saturate;
1131       break;
1132    }
1133    case nir_op_ffloor:
1134       inst = bld.RNDD(result, op[0]);
1135       inst->saturate = instr->dest.saturate;
1136       break;
1137    case nir_op_ffract:
1138       inst = bld.FRC(result, op[0]);
1139       inst->saturate = instr->dest.saturate;
1140       break;
1141    case nir_op_fround_even:
1142       inst = bld.RNDE(result, op[0]);
1143       inst->saturate = instr->dest.saturate;
1144       break;
1145
1146    case nir_op_fquantize2f16: {
1147       fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
1148       fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
1149       fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
1150
1151       /* The destination stride must be at least as big as the source stride. */
1152       tmp16.type = BRW_REGISTER_TYPE_W;
1153       tmp16.stride = 2;
1154
1155       /* Check for denormal */
1156       fs_reg abs_src0 = op[0];
1157       abs_src0.abs = true;
1158       bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1159               BRW_CONDITIONAL_L);
1160       /* Get the appropriately signed zero */
1161       bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
1162               retype(op[0], BRW_REGISTER_TYPE_UD),
1163               brw_imm_ud(0x80000000));
1164       /* Do the actual F32 -> F16 -> F32 conversion */
1165       bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
1166       bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
1167       /* Select that or zero based on normal status */
1168       inst = bld.SEL(result, zero, tmp32);
1169       inst->predicate = BRW_PREDICATE_NORMAL;
1170       inst->saturate = instr->dest.saturate;
1171       break;
1172    }
1173
1174    case nir_op_imin:
1175    case nir_op_umin:
1176    case nir_op_fmin:
1177       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1178       inst->saturate = instr->dest.saturate;
1179       break;
1180
1181    case nir_op_imax:
1182    case nir_op_umax:
1183    case nir_op_fmax:
1184       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1185       inst->saturate = instr->dest.saturate;
1186       break;
1187
1188    case nir_op_pack_snorm_2x16:
1189    case nir_op_pack_snorm_4x8:
1190    case nir_op_pack_unorm_2x16:
1191    case nir_op_pack_unorm_4x8:
1192    case nir_op_unpack_snorm_2x16:
1193    case nir_op_unpack_snorm_4x8:
1194    case nir_op_unpack_unorm_2x16:
1195    case nir_op_unpack_unorm_4x8:
1196    case nir_op_unpack_half_2x16:
1197    case nir_op_pack_half_2x16:
1198       unreachable("not reached: should be handled by lower_packing_builtins");
1199
1200    case nir_op_unpack_half_2x16_split_x:
1201       inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
1202       inst->saturate = instr->dest.saturate;
1203       break;
1204    case nir_op_unpack_half_2x16_split_y:
1205       inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
1206       inst->saturate = instr->dest.saturate;
1207       break;
1208
1209    case nir_op_pack_64_2x32_split:
1210       bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1211       break;
1212
1213    case nir_op_unpack_64_2x32_split_x:
1214    case nir_op_unpack_64_2x32_split_y: {
1215       if (instr->op == nir_op_unpack_64_2x32_split_x)
1216          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
1217       else
1218          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
1219       break;
1220    }
1221
1222    case nir_op_fpow:
1223       inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
1224       inst->saturate = instr->dest.saturate;
1225       break;
1226
1227    case nir_op_bitfield_reverse:
1228       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1229       bld.BFREV(result, op[0]);
1230       break;
1231
1232    case nir_op_bit_count:
1233       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1234       bld.CBIT(result, op[0]);
1235       break;
1236
1237    case nir_op_ufind_msb: {
1238       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1239       emit_find_msb_using_lzd(bld, result, op[0], false);
1240       break;
1241    }
1242
1243    case nir_op_ifind_msb: {
1244       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1245
1246       if (devinfo->gen < 7) {
1247          emit_find_msb_using_lzd(bld, result, op[0], true);
1248       } else {
1249          bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1250
1251          /* FBH counts from the MSB side, while GLSL's findMSB() wants the
1252           * count from the LSB side. If FBH didn't return an error
1253           * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
1254           * count into an LSB count.
1255           */
1256          bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
1257
1258          inst = bld.ADD(result, result, brw_imm_d(31));
1259          inst->predicate = BRW_PREDICATE_NORMAL;
1260          inst->src[0].negate = true;
1261       }
1262       break;
1263    }
1264
1265    case nir_op_find_lsb:
1266       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1267
1268       if (devinfo->gen < 7) {
1269          fs_reg temp = vgrf(glsl_type::int_type);
1270
1271          /* (x & -x) generates a value that consists of only the LSB of x.
1272           * For all powers of 2, findMSB(y) == findLSB(y).
1273           */
1274          fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
1275          fs_reg negated_src = src;
1276
1277          /* One must be negated, and the other must be non-negated.  It
1278           * doesn't matter which is which.
1279           */
1280          negated_src.negate = true;
1281          src.negate = false;
1282
1283          bld.AND(temp, src, negated_src);
1284          emit_find_msb_using_lzd(bld, result, temp, false);
1285       } else {
1286          bld.FBL(result, op[0]);
1287       }
1288       break;
1289
1290    case nir_op_ubitfield_extract:
1291    case nir_op_ibitfield_extract:
1292       unreachable("should have been lowered");
1293    case nir_op_ubfe:
1294    case nir_op_ibfe:
1295       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1296       bld.BFE(result, op[2], op[1], op[0]);
1297       break;
1298    case nir_op_bfm:
1299       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1300       bld.BFI1(result, op[0], op[1]);
1301       break;
1302    case nir_op_bfi:
1303       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1304       bld.BFI2(result, op[0], op[1], op[2]);
1305       break;
1306
1307    case nir_op_bitfield_insert:
1308       unreachable("not reached: should have been lowered");
1309
1310    case nir_op_ishl:
1311       bld.SHL(result, op[0], op[1]);
1312       break;
1313    case nir_op_ishr:
1314       bld.ASR(result, op[0], op[1]);
1315       break;
1316    case nir_op_ushr:
1317       bld.SHR(result, op[0], op[1]);
1318       break;
1319
1320    case nir_op_pack_half_2x16_split:
1321       bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1322       break;
1323
1324    case nir_op_ffma:
1325       inst = bld.MAD(result, op[2], op[1], op[0]);
1326       inst->saturate = instr->dest.saturate;
1327       break;
1328
1329    case nir_op_flrp:
1330       inst = bld.LRP(result, op[0], op[1], op[2]);
1331       inst->saturate = instr->dest.saturate;
1332       break;
1333
1334    case nir_op_bcsel:
1335       if (optimize_frontfacing_ternary(instr, result))
1336          return;
1337
1338       bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1339       inst = bld.SEL(result, op[1], op[2]);
1340       inst->predicate = BRW_PREDICATE_NORMAL;
1341       break;
1342
1343    case nir_op_extract_u8:
1344    case nir_op_extract_i8: {
1345       const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1346       nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
1347       assert(byte != NULL);
1348       bld.MOV(result, subscript(op[0], type, byte->u32[0]));
1349       break;
1350    }
1351
1352    case nir_op_extract_u16:
1353    case nir_op_extract_i16: {
1354       const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
1355       nir_const_value *word = nir_src_as_const_value(instr->src[1].src);
1356       assert(word != NULL);
1357       bld.MOV(result, subscript(op[0], type, word->u32[0]));
1358       break;
1359    }
1360
1361    default:
1362       unreachable("unhandled instruction");
1363    }
1364
1365    /* If we need to do a boolean resolve, replace the result with -(x & 1)
1366     * to sign extend the low bit to 0/~0
1367     */
1368    if (devinfo->gen <= 5 &&
1369        (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
1370       fs_reg masked = vgrf(glsl_type::int_type);
1371       bld.AND(masked, result, brw_imm_d(1));
1372       masked.negate = true;
1373       bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
1374    }
1375 }
1376
1377 void
1378 fs_visitor::nir_emit_load_const(const fs_builder &bld,
1379                                 nir_load_const_instr *instr)
1380 {
1381    const brw_reg_type reg_type =
1382       instr->def.bit_size == 32 ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
1383    fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1384
1385    switch (instr->def.bit_size) {
1386    case 32:
1387       for (unsigned i = 0; i < instr->def.num_components; i++)
1388          bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i]));
1389       break;
1390
1391    case 64:
1392       for (unsigned i = 0; i < instr->def.num_components; i++)
1393          bld.MOV(offset(reg, bld, i),
1394                  setup_imm_df(bld, instr->value.f64[i]));
1395       break;
1396
1397    default:
1398       unreachable("Invalid bit size");
1399    }
1400
1401    nir_ssa_values[instr->def.index] = reg;
1402 }
1403
1404 fs_reg
1405 fs_visitor::get_nir_src(const nir_src &src)
1406 {
1407    fs_reg reg;
1408    if (src.is_ssa) {
1409       if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
1410          const brw_reg_type reg_type = src.ssa->bit_size == 32 ?
1411             BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
1412          reg = bld.vgrf(reg_type, src.ssa->num_components);
1413       } else {
1414          reg = nir_ssa_values[src.ssa->index];
1415       }
1416    } else {
1417       /* We don't handle indirects on locals */
1418       assert(src.reg.indirect == NULL);
1419       reg = offset(nir_locals[src.reg.reg->index], bld,
1420                    src.reg.base_offset * src.reg.reg->num_components);
1421    }
1422
1423    /* to avoid floating-point denorm flushing problems, set the type by
1424     * default to D - instructions that need floating point semantics will set
1425     * this to F if they need to
1426     */
1427    return retype(reg, BRW_REGISTER_TYPE_D);
1428 }
1429
1430 /**
1431  * Return an IMM for constants; otherwise call get_nir_src() as normal.
1432  */
1433 fs_reg
1434 fs_visitor::get_nir_src_imm(const nir_src &src)
1435 {
1436    nir_const_value *val = nir_src_as_const_value(src);
1437    return val ? fs_reg(brw_imm_d(val->i32[0])) : get_nir_src(src);
1438 }
1439
1440 fs_reg
1441 fs_visitor::get_nir_dest(const nir_dest &dest)
1442 {
1443    if (dest.is_ssa) {
1444       const brw_reg_type reg_type =
1445          dest.ssa.bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
1446       nir_ssa_values[dest.ssa.index] =
1447          bld.vgrf(reg_type, dest.ssa.num_components);
1448       return nir_ssa_values[dest.ssa.index];
1449    } else {
1450       /* We don't handle indirects on locals */
1451       assert(dest.reg.indirect == NULL);
1452       return offset(nir_locals[dest.reg.reg->index], bld,
1453                     dest.reg.base_offset * dest.reg.reg->num_components);
1454    }
1455 }
1456
1457 fs_reg
1458 fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
1459 {
1460    fs_reg image(UNIFORM, deref->var->data.driver_location / 4,
1461                 BRW_REGISTER_TYPE_UD);
1462    fs_reg indirect;
1463    unsigned indirect_max = 0;
1464
1465    for (const nir_deref *tail = &deref->deref; tail->child;
1466         tail = tail->child) {
1467       const nir_deref_array *deref_array = nir_deref_as_array(tail->child);
1468       assert(tail->child->deref_type == nir_deref_type_array);
1469       const unsigned size = glsl_get_length(tail->type);
1470       const unsigned element_size = type_size_scalar(deref_array->deref.type);
1471       const unsigned base = MIN2(deref_array->base_offset, size - 1);
1472       image = offset(image, bld, base * element_size);
1473
1474       if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
1475          fs_reg tmp = vgrf(glsl_type::uint_type);
1476
1477          /* Accessing an invalid surface index with the dataport can result
1478           * in a hang.  According to the spec "if the index used to
1479           * select an individual element is negative or greater than or
1480           * equal to the size of the array, the results of the operation
1481           * are undefined but may not lead to termination" -- which is one
1482           * of the possible outcomes of the hang.  Clamp the index to
1483           * prevent access outside of the array bounds.
1484           */
1485          bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect),
1486                                      BRW_REGISTER_TYPE_UD),
1487                          brw_imm_ud(size - base - 1), BRW_CONDITIONAL_L);
1488
1489          indirect_max += element_size * (tail->type->length - 1);
1490
1491          bld.MUL(tmp, tmp, brw_imm_ud(element_size * 4));
1492          if (indirect.file == BAD_FILE) {
1493             indirect = tmp;
1494          } else {
1495             bld.ADD(indirect, indirect, tmp);
1496          }
1497       }
1498    }
1499
1500    if (indirect.file == BAD_FILE) {
1501       return image;
1502    } else {
1503       /* Emit a pile of MOVs to load the uniform into a temporary.  The
1504        * dead-code elimination pass will get rid of what we don't use.
1505        */
1506       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, BRW_IMAGE_PARAM_SIZE);
1507       for (unsigned j = 0; j < BRW_IMAGE_PARAM_SIZE; j++) {
1508          bld.emit(SHADER_OPCODE_MOV_INDIRECT,
1509                   offset(tmp, bld, j), offset(image, bld, j),
1510                   indirect, brw_imm_ud((indirect_max + 1) * 4));
1511       }
1512       return tmp;
1513    }
1514 }
1515
1516 void
1517 fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
1518                          unsigned wr_mask)
1519 {
1520    for (unsigned i = 0; i < 4; i++) {
1521       if (!((wr_mask >> i) & 1))
1522          continue;
1523
1524       fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
1525       new_inst->dst = offset(new_inst->dst, bld, i);
1526       for (unsigned j = 0; j < new_inst->sources; j++)
1527          if (new_inst->src[j].file == VGRF)
1528             new_inst->src[j] = offset(new_inst->src[j], bld, i);
1529
1530       bld.emit(new_inst);
1531    }
1532 }
1533
1534 /**
1535  * Get the matching channel register datatype for an image intrinsic of the
1536  * specified GLSL image type.
1537  */
1538 static brw_reg_type
1539 get_image_base_type(const glsl_type *type)
1540 {
1541    switch ((glsl_base_type)type->sampled_type) {
1542    case GLSL_TYPE_UINT:
1543       return BRW_REGISTER_TYPE_UD;
1544    case GLSL_TYPE_INT:
1545       return BRW_REGISTER_TYPE_D;
1546    case GLSL_TYPE_FLOAT:
1547       return BRW_REGISTER_TYPE_F;
1548    default:
1549       unreachable("Not reached.");
1550    }
1551 }
1552
1553 /**
1554  * Get the appropriate atomic op for an image atomic intrinsic.
1555  */
1556 static unsigned
1557 get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type)
1558 {
1559    switch (op) {
1560    case nir_intrinsic_image_atomic_add:
1561       return BRW_AOP_ADD;
1562    case nir_intrinsic_image_atomic_min:
1563       return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
1564               BRW_AOP_IMIN : BRW_AOP_UMIN);
1565    case nir_intrinsic_image_atomic_max:
1566       return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
1567               BRW_AOP_IMAX : BRW_AOP_UMAX);
1568    case nir_intrinsic_image_atomic_and:
1569       return BRW_AOP_AND;
1570    case nir_intrinsic_image_atomic_or:
1571       return BRW_AOP_OR;
1572    case nir_intrinsic_image_atomic_xor:
1573       return BRW_AOP_XOR;
1574    case nir_intrinsic_image_atomic_exchange:
1575       return BRW_AOP_MOV;
1576    case nir_intrinsic_image_atomic_comp_swap:
1577       return BRW_AOP_CMPWR;
1578    default:
1579       unreachable("Not reachable.");
1580    }
1581 }
1582
1583 static fs_inst *
1584 emit_pixel_interpolater_send(const fs_builder &bld,
1585                              enum opcode opcode,
1586                              const fs_reg &dst,
1587                              const fs_reg &src,
1588                              const fs_reg &desc,
1589                              glsl_interp_mode interpolation)
1590 {
1591    struct brw_wm_prog_data *wm_prog_data =
1592       brw_wm_prog_data(bld.shader->stage_prog_data);
1593    fs_inst *inst;
1594    fs_reg payload;
1595    int mlen;
1596
1597    if (src.file == BAD_FILE) {
1598       /* Dummy payload */
1599       payload = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
1600       mlen = 1;
1601    } else {
1602       payload = src;
1603       mlen = 2 * bld.dispatch_width() / 8;
1604    }
1605
1606    inst = bld.emit(opcode, dst, payload, desc);
1607    inst->mlen = mlen;
1608    /* 2 floats per slot returned */
1609    inst->size_written = 2 * dst.component_size(inst->exec_size);
1610    inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
1611
1612    wm_prog_data->pulls_bary = true;
1613
1614    return inst;
1615 }
1616
1617 /**
1618  * Computes 1 << x, given a D/UD register containing some value x.
1619  */
1620 static fs_reg
1621 intexp2(const fs_builder &bld, const fs_reg &x)
1622 {
1623    assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
1624
1625    fs_reg result = bld.vgrf(x.type, 1);
1626    fs_reg one = bld.vgrf(x.type, 1);
1627
1628    bld.MOV(one, retype(brw_imm_d(1), one.type));
1629    bld.SHL(result, one, x);
1630    return result;
1631 }
1632
1633 void
1634 fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
1635 {
1636    assert(stage == MESA_SHADER_GEOMETRY);
1637
1638    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1639
1640    if (gs_compile->control_data_header_size_bits == 0)
1641       return;
1642
1643    /* We can only do EndPrimitive() functionality when the control data
1644     * consists of cut bits.  Fortunately, the only time it isn't is when the
1645     * output type is points, in which case EndPrimitive() is a no-op.
1646     */
1647    if (gs_prog_data->control_data_format !=
1648        GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
1649       return;
1650    }
1651
1652    /* Cut bits use one bit per vertex. */
1653    assert(gs_compile->control_data_bits_per_vertex == 1);
1654
1655    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1656    vertex_count.type = BRW_REGISTER_TYPE_UD;
1657
1658    /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
1659     * vertex n, 0 otherwise.  So all we need to do here is mark bit
1660     * (vertex_count - 1) % 32 in the cut_bits register to indicate that
1661     * EndPrimitive() was called after emitting vertex (vertex_count - 1);
1662     * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
1663     *
1664     * Note that if EndPrimitive() is called before emitting any vertices, this
1665     * will cause us to set bit 31 of the control_data_bits register to 1.
1666     * That's fine because:
1667     *
1668     * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
1669     *   output, so the hardware will ignore cut bit 31.
1670     *
1671     * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
1672     *   last vertex, so setting cut bit 31 has no effect (since the primitive
1673     *   is automatically ended when the GS terminates).
1674     *
1675     * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
1676     *   control_data_bits register to 0 when the first vertex is emitted.
1677     */
1678
1679    const fs_builder abld = bld.annotate("end primitive");
1680
1681    /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
1682    fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1683    abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1684    fs_reg mask = intexp2(abld, prev_count);
1685    /* Note: we're relying on the fact that the GEN SHL instruction only pays
1686     * attention to the lower 5 bits of its second source argument, so on this
1687     * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
1688     * ((vertex_count - 1) % 32).
1689     */
1690    abld.OR(this->control_data_bits, this->control_data_bits, mask);
1691 }
1692
1693 void
1694 fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
1695 {
1696    assert(stage == MESA_SHADER_GEOMETRY);
1697    assert(gs_compile->control_data_bits_per_vertex != 0);
1698
1699    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1700
1701    const fs_builder abld = bld.annotate("emit control data bits");
1702    const fs_builder fwa_bld = bld.exec_all();
1703
1704    /* We use a single UD register to accumulate control data bits (32 bits
1705     * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
1706     * at a time.
1707     *
1708     * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
1709     * We have select a 128-bit group via the Global and Per-Slot Offsets, then
1710     * use the Channel Mask phase to enable/disable which DWord within that
1711     * group to write.  (Remember, different SIMD8 channels may have emitted
1712     * different numbers of vertices, so we may need per-slot offsets.)
1713     *
1714     * Channel masking presents an annoying problem: we may have to replicate
1715     * the data up to 4 times:
1716     *
1717     * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
1718     *
1719     * To avoid penalizing shaders that emit a small number of vertices, we
1720     * can avoid these sometimes: if the size of the control data header is
1721     * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
1722     * land in the same 128-bit group, so we can skip per-slot offsets.
1723     *
1724     * Similarly, if the control data header is <= 32 bits, there is only one
1725     * DWord, so we can skip channel masks.
1726     */
1727    enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
1728
1729    fs_reg channel_mask, per_slot_offset;
1730
1731    if (gs_compile->control_data_header_size_bits > 32) {
1732       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
1733       channel_mask = vgrf(glsl_type::uint_type);
1734    }
1735
1736    if (gs_compile->control_data_header_size_bits > 128) {
1737       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
1738       per_slot_offset = vgrf(glsl_type::uint_type);
1739    }
1740
1741    /* Figure out which DWord we're trying to write to using the formula:
1742     *
1743     *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
1744     *
1745     * Since bits_per_vertex is a power of two, and is known at compile
1746     * time, this can be optimized to:
1747     *
1748     *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
1749     */
1750    if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
1751       fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1752       fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1753       abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1754       unsigned log2_bits_per_vertex =
1755          util_last_bit(gs_compile->control_data_bits_per_vertex);
1756       abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
1757
1758       if (per_slot_offset.file != BAD_FILE) {
1759          /* Set the per-slot offset to dword_index / 4, so that we'll write to
1760           * the appropriate OWord within the control data header.
1761           */
1762          abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
1763       }
1764
1765       /* Set the channel masks to 1 << (dword_index % 4), so that we'll
1766        * write to the appropriate DWORD within the OWORD.
1767        */
1768       fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1769       fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
1770       channel_mask = intexp2(fwa_bld, channel);
1771       /* Then the channel masks need to be in bits 23:16. */
1772       fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
1773    }
1774
1775    /* Store the control data bits in the message payload and send it. */
1776    int mlen = 2;
1777    if (channel_mask.file != BAD_FILE)
1778       mlen += 4; /* channel masks, plus 3 extra copies of the data */
1779    if (per_slot_offset.file != BAD_FILE)
1780       mlen++;
1781
1782    fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
1783    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
1784    int i = 0;
1785    sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1786    if (per_slot_offset.file != BAD_FILE)
1787       sources[i++] = per_slot_offset;
1788    if (channel_mask.file != BAD_FILE)
1789       sources[i++] = channel_mask;
1790    while (i < mlen) {
1791       sources[i++] = this->control_data_bits;
1792    }
1793
1794    abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
1795    fs_inst *inst = abld.emit(opcode, reg_undef, payload);
1796    inst->mlen = mlen;
1797    /* We need to increment Global Offset by 256-bits to make room for
1798     * Broadwell's extra "Vertex Count" payload at the beginning of the
1799     * URB entry.  Since this is an OWord message, Global Offset is counted
1800     * in 128-bit units, so we must set it to 2.
1801     */
1802    if (gs_prog_data->static_vertex_count == -1)
1803       inst->offset = 2;
1804 }
1805
1806 void
1807 fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
1808                                             unsigned stream_id)
1809 {
1810    /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
1811
1812    /* Note: we are calling this *before* increasing vertex_count, so
1813     * this->vertex_count == vertex_count - 1 in the formula above.
1814     */
1815
1816    /* Stream mode uses 2 bits per vertex */
1817    assert(gs_compile->control_data_bits_per_vertex == 2);
1818
1819    /* Must be a valid stream */
1820    assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
1821
1822    /* Control data bits are initialized to 0 so we don't have to set any
1823     * bits when sending vertices to stream 0.
1824     */
1825    if (stream_id == 0)
1826       return;
1827
1828    const fs_builder abld = bld.annotate("set stream control data bits", NULL);
1829
1830    /* reg::sid = stream_id */
1831    fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1832    abld.MOV(sid, brw_imm_ud(stream_id));
1833
1834    /* reg:shift_count = 2 * (vertex_count - 1) */
1835    fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1836    abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
1837
1838    /* Note: we're relying on the fact that the GEN SHL instruction only pays
1839     * attention to the lower 5 bits of its second source argument, so on this
1840     * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
1841     * stream_id << ((2 * (vertex_count - 1)) % 32).
1842     */
1843    fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1844    abld.SHL(mask, sid, shift_count);
1845    abld.OR(this->control_data_bits, this->control_data_bits, mask);
1846 }
1847
1848 void
1849 fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
1850                            unsigned stream_id)
1851 {
1852    assert(stage == MESA_SHADER_GEOMETRY);
1853
1854    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1855
1856    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1857    vertex_count.type = BRW_REGISTER_TYPE_UD;
1858
1859    /* Haswell and later hardware ignores the "Render Stream Select" bits
1860     * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
1861     * and instead sends all primitives down the pipeline for rasterization.
1862     * If the SOL stage is enabled, "Render Stream Select" is honored and
1863     * primitives bound to non-zero streams are discarded after stream output.
1864     *
1865     * Since the only purpose of primives sent to non-zero streams is to
1866     * be recorded by transform feedback, we can simply discard all geometry
1867     * bound to these streams when transform feedback is disabled.
1868     */
1869    if (stream_id > 0 && !nir->info->has_transform_feedback_varyings)
1870       return;
1871
1872    /* If we're outputting 32 control data bits or less, then we can wait
1873     * until the shader is over to output them all.  Otherwise we need to
1874     * output them as we go.  Now is the time to do it, since we're about to
1875     * output the vertex_count'th vertex, so it's guaranteed that the
1876     * control data bits associated with the (vertex_count - 1)th vertex are
1877     * correct.
1878     */
1879    if (gs_compile->control_data_header_size_bits > 32) {
1880       const fs_builder abld =
1881          bld.annotate("emit vertex: emit control data bits");
1882
1883       /* Only emit control data bits if we've finished accumulating a batch
1884        * of 32 bits.  This is the case when:
1885        *
1886        *     (vertex_count * bits_per_vertex) % 32 == 0
1887        *
1888        * (in other words, when the last 5 bits of vertex_count *
1889        * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
1890        * integer n (which is always the case, since bits_per_vertex is
1891        * always 1 or 2), this is equivalent to requiring that the last 5-n
1892        * bits of vertex_count are 0:
1893        *
1894        *     vertex_count & (2^(5-n) - 1) == 0
1895        *
1896        * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
1897        * equivalent to:
1898        *
1899        *     vertex_count & (32 / bits_per_vertex - 1) == 0
1900        *
1901        * TODO: If vertex_count is an immediate, we could do some of this math
1902        *       at compile time...
1903        */
1904       fs_inst *inst =
1905          abld.AND(bld.null_reg_d(), vertex_count,
1906                   brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
1907       inst->conditional_mod = BRW_CONDITIONAL_Z;
1908
1909       abld.IF(BRW_PREDICATE_NORMAL);
1910       /* If vertex_count is 0, then no control data bits have been
1911        * accumulated yet, so we can skip emitting them.
1912        */
1913       abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
1914                BRW_CONDITIONAL_NEQ);
1915       abld.IF(BRW_PREDICATE_NORMAL);
1916       emit_gs_control_data_bits(vertex_count);
1917       abld.emit(BRW_OPCODE_ENDIF);
1918
1919       /* Reset control_data_bits to 0 so we can start accumulating a new
1920        * batch.
1921        *
1922        * Note: in the case where vertex_count == 0, this neutralizes the
1923        * effect of any call to EndPrimitive() that the shader may have
1924        * made before outputting its first vertex.
1925        */
1926       inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
1927       inst->force_writemask_all = true;
1928       abld.emit(BRW_OPCODE_ENDIF);
1929    }
1930
1931    emit_urb_writes(vertex_count);
1932
1933    /* In stream mode we have to set control data bits for all vertices
1934     * unless we have disabled control data bits completely (which we do
1935     * do for GL_POINTS outputs that don't use streams).
1936     */
1937    if (gs_compile->control_data_header_size_bits > 0 &&
1938        gs_prog_data->control_data_format ==
1939           GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
1940       set_gs_stream_control_data_bits(vertex_count, stream_id);
1941    }
1942 }
1943
/* Emit the code that loads a geometry shader per-vertex input into dst.
 *
 * When the shader has a single invocation and both the vertex index and the
 * offset are compile-time constants that fall inside the pushed range, the
 * value is copied straight out of the ATTR registers.  Otherwise the input
 * is pulled from the URB with a URB_READ_SIMD8 message (or the PER_SLOT
 * variant when the offset is not constant), using the ICP handle of the
 * selected vertex.
 *
 * dst              destination register, written one component at a time
 * vertex_src       NIR source holding the input vertex index
 * base_offset      constant part of the offset; 0 denotes the VUE header
 *                  and is treated as a gl_PointSize read
 * offset_src       NIR source holding the remaining offset, either constant
 *                  or indirect
 * num_components   number of components to load
 * first_component  index of the first component to load; halved for 64-bit
 *                  destination types on the pull path
 */
1944 void
1945 fs_visitor::emit_gs_input_load(const fs_reg &dst,
1946                                const nir_src &vertex_src,
1947                                unsigned base_offset,
1948                                const nir_src &offset_src,
1949                                unsigned num_components,
1950                                unsigned first_component)
1951 {
1952    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1953
        /* Both are NULL when the corresponding source is not a constant. */
1954    nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
1955    nir_const_value *offset_const = nir_src_as_const_value(offset_src);
1956    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
1957
1958    /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y],
1959     * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].  Only
1960     * gl_PointSize is available as a GS input, however, so it must be that.
1961     */
1962    const bool is_point_size = (base_offset == 0);
1963
        /* Push path: only taken for a single invocation with a constant
         * vertex index and a constant offset that lies below push_reg_count.
         *
         * NOTE(review): the path below already requires invocations == 1, so
         * the TODO presumably refers to invocations != 1 -- confirm.
         */
1964    /* TODO: figure out push input layout for invocations == 1 */
1965    if (gs_prog_data->invocations == 1 &&
1966        offset_const != NULL && vertex_const != NULL &&
1967        4 * (base_offset + offset_const->u32[0]) < push_reg_count) {
           /* Four ATTR components per offset unit, and push_reg_count
            * components per input vertex.
            */
1968       int imm_offset = (base_offset + offset_const->u32[0]) * 4 +
1969                        vertex_const->u32[0] * push_reg_count;
1970       /* This input was pushed into registers. */
1971       if (is_point_size) {
1972          /* gl_PointSize comes in .w */
1973          bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type));
1974       } else {
1975          for (unsigned i = 0; i < num_components; i++) {
1976             bld.MOV(offset(dst, bld, i),
1977                     fs_reg(ATTR, imm_offset + i + first_component, dst.type));
1978          }
1979       }
1980       return;
1981    }
1982
1983    /* Resort to the pull model.  Ensure the VUE handles are provided. */
1984    gs_prog_data->base.include_vue_handles = true;
1985
        /* The ICP handles start at g2, or at g3 when the primitive ID is
         * included.  icp_handle ends up holding the URB handle of the vertex
         * being read (or is replaced by the fixed register that holds it).
         */
1986    unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
1987    fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1988
1989    if (gs_prog_data->invocations == 1) {
1990       if (vertex_const) {
1991          /* The vertex index is constant; just select the proper URB handle. */
1992          icp_handle =
1993             retype(brw_vec8_grf(first_icp_handle + vertex_const->i32[0], 0),
1994                    BRW_REGISTER_TYPE_UD);
1995       } else {
1996          /* The vertex index is non-constant.  We need to use indirect
1997           * addressing to fetch the proper URB handle.
1998           *
1999           * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2000           * indicating that channel <n> should read the handle from
2001           * DWord <n>.  We convert that to bytes by multiplying by 4.
2002           *
2003           * Next, we convert the vertex index to bytes by multiplying
2004           * by 32 (shifting by 5), and add the two together.  This is
2005           * the final indirect byte offset.
2006           */
2007          fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1);
2008          fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2009          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2010          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2011
2012          /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
2013          bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
2014          /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2015          bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2016          /* Convert vertex_index to bytes (multiply by 32) */
2017          bld.SHL(vertex_offset_bytes,
2018                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2019                  brw_imm_ud(5u));
2020          bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2021
2022          /* Use first_icp_handle as the base offset.  There is one register
2023           * of URB handles per vertex, so inform the register allocator that
2024           * we might read up to nir->info->gs.vertices_in registers.
2025           */
2026          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2027                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2028                   fs_reg(icp_offset_bytes),
2029                   brw_imm_ud(nir->info->gs.vertices_in * REG_SIZE));
2030       }
2031    } else {
2032       assert(gs_prog_data->invocations > 1);
2033
           /* With more than one invocation each handle is a single DWord:
            * vertex v is read from DWord (v % 8) of register
            * first_icp_handle + v / 8.
            */
2034       if (vertex_const) {
2035          assert(devinfo->gen >= 9 || vertex_const->i32[0] <= 5);
2036          bld.MOV(icp_handle,
2037                  retype(brw_vec1_grf(first_icp_handle +
2038                                      vertex_const->i32[0] / 8,
2039                                      vertex_const->i32[0] % 8),
2040                         BRW_REGISTER_TYPE_UD));
2041       } else {
2042          /* The vertex index is non-constant.  We need to use indirect
2043           * addressing to fetch the proper URB handle.
2044           *
2045           */
2046          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2047
2048          /* Convert vertex_index to bytes (multiply by 4) */
2049          bld.SHL(icp_offset_bytes,
2050                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2051                  brw_imm_ud(2u));
2052
2053          /* Use first_icp_handle as the base offset.  There is one DWord
2054           * of URB handles per vertex, so inform the register allocator that
2055           * we might read up to ceil(nir->info->gs.vertices_in / 8) registers.
2056           */
2057          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2058                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2059                   fs_reg(icp_offset_bytes),
2060                   brw_imm_ud(DIV_ROUND_UP(nir->info->gs.vertices_in, 8) *
2061                              REG_SIZE));
2062       }
2063    }
2064
        /* The URB read emitted last.  It is assigned on every path through
         * the loop below, which runs at least once, and is patched after the
         * loop for gl_PointSize reads.
         */
2065    fs_inst *inst;
2066
2067    fs_reg tmp_dst = dst;
2068    fs_reg indirect_offset = get_nir_src(offset_src);
2069    unsigned num_iterations = 1;
2070    unsigned orig_num_components = num_components;
2071
        /* 64-bit destinations: at most two components are read per pass, so
         * more than two components take two passes.  Each pass reads into a
         * temporary, which is shuffled into 64-bit layout and then copied to
         * dst.  first_component is halved here before it is used as a
         * component offset on the 64-bit typed registers below.
         */
2072    if (type_sz(dst.type) == 8) {
2073       if (num_components > 2) {
2074          num_iterations = 2;
2075          num_components = 2;
2076       }
2077       fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2078       tmp_dst = tmp;
2079       first_component = first_component / 2;
2080    }
2081
2082    for (unsigned iter = 0; iter < num_iterations; iter++) {
2083       if (offset_const) {
2084          /* Constant indexing - use global offset. */
2085          if (first_component != 0) {
                 /* Read everything up to the last wanted component into a
                  * scratch register, then copy out the wanted ones.
                  */
2086             unsigned read_components = num_components + first_component;
2087             fs_reg tmp = bld.vgrf(dst.type, read_components);
2088             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2089             inst->size_written = read_components *
2090                                  tmp.component_size(inst->exec_size);
2091             for (unsigned i = 0; i < num_components; i++) {
2092                bld.MOV(offset(tmp_dst, bld, i),
2093                        offset(tmp, bld, i + first_component));
2094             }
2095          } else {
2096             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst,
2097                             icp_handle);
2098             inst->size_written = num_components *
2099                                  tmp_dst.component_size(inst->exec_size);
2100          }
2101          inst->offset = base_offset + offset_const->u32[0];
2102          inst->mlen = 1;
2103       } else {
2104          /* Indirect indexing - use per-slot offsets as well. */
2105          const fs_reg srcs[] = { icp_handle, indirect_offset };
2106          unsigned read_components = num_components + first_component;
2107          fs_reg tmp = bld.vgrf(dst.type, read_components);
2108          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2109          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2110          if (first_component != 0) {
2111             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2112                             payload);
2113             inst->size_written = read_components *
2114                                  tmp.component_size(inst->exec_size);
2115             for (unsigned i = 0; i < num_components; i++) {
2116                bld.MOV(offset(tmp_dst, bld, i),
2117                        offset(tmp, bld, i + first_component));
2118             }
2119          } else {
2120             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst,
2121                          payload);
2122             inst->size_written = num_components *
2123                                  tmp_dst.component_size(inst->exec_size);
2124          }
2125          inst->offset = base_offset;
2126          inst->mlen = 2;
2127       }
2128
2129       if (type_sz(dst.type) == 8) {
              /* Shuffle the data read in this pass in place, then copy it
               * to components iter * 2 onwards of the real destination.
               */
2130          shuffle_32bit_load_result_to_64bit_data(
2131             bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components);
2132
2133          for (unsigned c = 0; c < num_components; c++)
2134             bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
2135       }
2136
           /* Set up the second pass: only the components beyond the first
            * two remain, and the offset is advanced by one, either in the
            * constant base or by adding 1 to the indirect offset.
            */
2137       if (num_iterations > 1) {
2138          num_components = orig_num_components - 2;
2139          if(offset_const) {
2140             base_offset++;
2141          } else {
2142             fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2143             bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u));
2144             indirect_offset = new_indirect;
2145          }
2146       }
2147    }
2148
        /* gl_PointSize: redirect the last URB read into a four-component
         * temporary and copy component 3 (.w) from it into dst.
         */
2149    if (is_point_size) {
2150       /* Read the whole VUE header (because of alignment) and read .w. */
2151       fs_reg tmp = bld.vgrf(dst.type, 4);
2152       inst->dst = tmp;
2153       inst->size_written = 4 * REG_SIZE;
2154       bld.MOV(dst, offset(tmp, bld, 3));
2155    }
2156 }
2157
2158 fs_reg
2159 fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
2160 {
2161    nir_src *offset_src = nir_get_io_offset_src(instr);
2162    nir_const_value *const_value = nir_src_as_const_value(*offset_src);
2163
2164    if (const_value) {
2165       /* The only constant offset we should find is 0.  brw_nir.c's
2166        * add_const_offset_to_base() will fold other constant offsets
2167        * into instr->const_index[0].
2168        */
2169       assert(const_value->u32[0] == 0);
2170       return fs_reg();
2171    }
2172
2173    return get_nir_src(*offset_src);
2174 }
2175
2176 static void
2177 do_untyped_vector_read(const fs_builder &bld,
2178                        const fs_reg dest,
2179                        const fs_reg surf_index,
2180                        const fs_reg offset_reg,
2181                        unsigned num_components)
2182 {
2183    if (type_sz(dest.type) == 4) {
2184       fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
2185                                              1 /* dims */,
2186                                              num_components,
2187                                              BRW_PREDICATE_NONE);
2188       read_result.type = dest.type;
2189       for (unsigned i = 0; i < num_components; i++)
2190          bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
2191    } else if (type_sz(dest.type) == 8) {
2192       /* Reading a dvec, so we need to:
2193        *
2194        * 1. Multiply num_components by 2, to account for the fact that we
2195        *    need to read 64-bit components.
2196        * 2. Shuffle the result of the load to form valid 64-bit elements
2197        * 3. Emit a second load (for components z/w) if needed.
2198        */
2199       fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
2200       bld.MOV(read_offset, offset_reg);
2201
2202       int iters = num_components <= 2 ? 1 : 2;
2203
2204       /* Load the dvec, the first iteration loads components x/y, the second
2205        * iteration, if needed, loads components z/w
2206        */
2207       for (int it = 0; it < iters; it++) {
2208          /* Compute number of components to read in this iteration */
2209          int iter_components = MIN2(2, num_components);
2210          num_components -= iter_components;
2211
2212          /* Read. Since this message reads 32-bit components, we need to
2213           * read twice as many components.
2214           */
2215          fs_reg read_result = emit_untyped_read(bld, surf_index, read_offset,
2216                                                 1 /* dims */,
2217                                                 iter_components * 2,
2218                                                 BRW_PREDICATE_NONE);
2219
2220          /* Shuffle the 32-bit load result into valid 64-bit data */
2221          const fs_reg packed_result = bld.vgrf(dest.type, iter_components);
2222          shuffle_32bit_load_result_to_64bit_data(
2223             bld, packed_result, read_result, iter_components);
2224
2225          /* Move each component to its destination */
2226          read_result = retype(read_result, BRW_REGISTER_TYPE_DF);
2227          for (int c = 0; c < iter_components; c++) {
2228             bld.MOV(offset(dest, bld, it * 2 + c),
2229                     offset(packed_result, bld, c));
2230          }
2231
2232          bld.ADD(read_offset, read_offset, brw_imm_ud(16));
2233       }
2234    } else {
2235       unreachable("Unsupported type");
2236    }
2237 }
2238
2239 void
2240 fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
2241                                   nir_intrinsic_instr *instr)
2242 {
2243    assert(stage == MESA_SHADER_VERTEX);
2244
2245    fs_reg dest;
2246    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2247       dest = get_nir_dest(instr->dest);
2248
2249    switch (instr->intrinsic) {
2250    case nir_intrinsic_load_vertex_id:
2251       unreachable("should be lowered by lower_vertex_id()");
2252
2253    case nir_intrinsic_load_vertex_id_zero_base:
2254    case nir_intrinsic_load_base_vertex:
2255    case nir_intrinsic_load_instance_id:
2256    case nir_intrinsic_load_base_instance:
2257    case nir_intrinsic_load_draw_id: {
2258       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
2259       fs_reg val = nir_system_values[sv];
2260       assert(val.file != BAD_FILE);
2261       dest.type = val.type;
2262       bld.MOV(dest, val);
2263       break;
2264    }
2265
2266    case nir_intrinsic_load_input: {
2267       fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);
2268       unsigned first_component = nir_intrinsic_component(instr);
2269       unsigned num_components = instr->num_components;
2270       enum brw_reg_type type = dest.type;
2271
2272       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
2273       assert(const_offset && "Indirect input loads not allowed");
2274       src = offset(src, bld, const_offset->u32[0]);
2275
2276       for (unsigned j = 0; j < num_components; j++) {
2277          bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component));
2278       }
2279
2280       if (type == BRW_REGISTER_TYPE_DF) {
2281          /* Once the double vector is read, set again its original register
2282           * type to continue with normal execution.
2283           */
2284          src = retype(src, type);
2285          dest = retype(dest, type);
2286       }
2287
2288       if (type_sz(src.type) == 8) {
2289          shuffle_32bit_load_result_to_64bit_data(bld,
2290                                                  dest,
2291                                                  retype(dest, BRW_REGISTER_TYPE_F),
2292                                                  instr->num_components);
2293       }
2294       break;
2295    }
2296
2297    default:
2298       nir_emit_intrinsic(bld, instr);
2299       break;
2300    }
2301 }
2302
2303 void
2304 fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
2305                                    nir_intrinsic_instr *instr)
2306 {
2307    assert(stage == MESA_SHADER_TESS_CTRL);
2308    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2309    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2310
2311    fs_reg dst;
2312    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2313       dst = get_nir_dest(instr->dest);
2314
2315    switch (instr->intrinsic) {
2316    case nir_intrinsic_load_primitive_id:
2317       bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
2318       break;
2319    case nir_intrinsic_load_invocation_id:
2320       bld.MOV(retype(dst, invocation_id.type), invocation_id);
2321       break;
2322    case nir_intrinsic_load_patch_vertices_in:
2323       bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
2324               brw_imm_d(tcs_key->input_vertices));
2325       break;
2326
2327    case nir_intrinsic_barrier: {
2328       if (tcs_prog_data->instances == 1)
2329          break;
2330
2331       fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2332       fs_reg m0_2 = component(m0, 2);
2333
2334       const fs_builder chanbld = bld.exec_all().group(1, 0);
2335
2336       /* Zero the message header */
2337       bld.exec_all().MOV(m0, brw_imm_ud(0u));
2338
2339       /* Copy "Barrier ID" from r0.2, bits 16:13 */
2340       chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2341                   brw_imm_ud(INTEL_MASK(16, 13)));
2342
2343       /* Shift it up to bits 27:24. */
2344       chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
2345
2346       /* Set the Barrier Count and the enable bit */
2347       chanbld.OR(m0_2, m0_2,
2348                  brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
2349
2350       bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2351       break;
2352    }
2353
2354    case nir_intrinsic_load_input:
2355       unreachable("nir_lower_io should never give us these.");
2356       break;
2357
2358    case nir_intrinsic_load_per_vertex_input: {
2359       fs_reg indirect_offset = get_indirect_offset(instr);
2360       unsigned imm_offset = instr->const_index[0];
2361
2362       const nir_src &vertex_src = instr->src[0];
2363       nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
2364
2365       fs_inst *inst;
2366
2367       fs_reg icp_handle;
2368
2369       if (vertex_const) {
2370          /* Emit a MOV to resolve <0,1,0> regioning. */
2371          icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2372          bld.MOV(icp_handle,
2373                  retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3),
2374                                      vertex_const->i32[0] & 7),
2375                         BRW_REGISTER_TYPE_UD));
2376       } else if (tcs_prog_data->instances == 1 &&
2377                  vertex_src.is_ssa &&
2378                  vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
2379                  nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) {
2380          /* For the common case of only 1 instance, an array index of
2381           * gl_InvocationID means reading g1.  Skip all the indirect work.
2382           */
2383          icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
2384       } else {
2385          /* The vertex index is non-constant.  We need to use indirect
2386           * addressing to fetch the proper URB handle.
2387           */
2388          icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2389
2390          /* Each ICP handle is a single DWord (4 bytes) */
2391          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2392          bld.SHL(vertex_offset_bytes,
2393                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2394                  brw_imm_ud(2u));
2395
2396          /* Start at g1.  We might read up to 4 registers. */
2397          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2398                   retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
2399                   brw_imm_ud(4 * REG_SIZE));
2400       }
2401
2402       /* We can only read two double components with each URB read, so
2403        * we send two read messages in that case, each one loading up to
2404        * two double components.
2405        */
2406       unsigned num_iterations = 1;
2407       unsigned num_components = instr->num_components;
2408       unsigned first_component = nir_intrinsic_component(instr);
2409       fs_reg orig_dst = dst;
2410       if (type_sz(dst.type) == 8) {
2411          first_component = first_component / 2;
2412          if (instr->num_components > 2) {
2413             num_iterations = 2;
2414             num_components = 2;
2415          }
2416
2417          fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2418          dst = tmp;
2419       }
2420
2421       for (unsigned iter = 0; iter < num_iterations; iter++) {
2422          if (indirect_offset.file == BAD_FILE) {
2423             /* Constant indexing - use global offset. */
2424             if (first_component != 0) {
2425                unsigned read_components = num_components + first_component;
2426                fs_reg tmp = bld.vgrf(dst.type, read_components);
2427                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2428                for (unsigned i = 0; i < num_components; i++) {
2429                   bld.MOV(offset(dst, bld, i),
2430                           offset(tmp, bld, i + first_component));
2431                }
2432             } else {
2433                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
2434             }
2435             inst->offset = imm_offset;
2436             inst->mlen = 1;
2437          } else {
2438             /* Indirect indexing - use per-slot offsets as well. */
2439             const fs_reg srcs[] = { icp_handle, indirect_offset };
2440             fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2441             bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2442             if (first_component != 0) {
2443                unsigned read_components = num_components + first_component;
2444                fs_reg tmp = bld.vgrf(dst.type, read_components);
2445                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2446                                payload);
2447                for (unsigned i = 0; i < num_components; i++) {
2448                   bld.MOV(offset(dst, bld, i),
2449                           offset(tmp, bld, i + first_component));
2450                }
2451             } else {
2452                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2453                                payload);
2454             }
2455             inst->offset = imm_offset;
2456             inst->mlen = 2;
2457          }
2458          inst->size_written = (num_components + first_component) *
2459                               inst->dst.component_size(inst->exec_size);
2460
2461          /* If we are reading 64-bit data using 32-bit read messages we need
2462           * build proper 64-bit data elements by shuffling the low and high
2463           * 32-bit components around like we do for other things like UBOs
2464           * or SSBOs.
2465           */
2466          if (type_sz(dst.type) == 8) {
2467             shuffle_32bit_load_result_to_64bit_data(
2468                bld, dst, retype(dst, BRW_REGISTER_TYPE_F), num_components);
2469
2470             for (unsigned c = 0; c < num_components; c++) {
2471                bld.MOV(offset(orig_dst, bld, iter * 2 + c),
2472                        offset(dst, bld, c));
2473             }
2474          }
2475
2476          /* Copy the temporary to the destination to deal with writemasking.
2477           *
2478           * Also attempt to deal with gl_PointSize being in the .w component.
2479           */
2480          if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2481             assert(type_sz(dst.type) < 8);
2482             inst->dst = bld.vgrf(dst.type, 4);
2483             inst->size_written = 4 * REG_SIZE;
2484             bld.MOV(dst, offset(inst->dst, bld, 3));
2485          }
2486
2487          /* If we are loading double data and we need a second read message
2488           * adjust the write offset
2489           */
2490          if (num_iterations > 1) {
2491             num_components = instr->num_components - 2;
2492             imm_offset++;
2493          }
2494       }
2495       break;
2496    }
2497
2498    case nir_intrinsic_load_output:
2499    case nir_intrinsic_load_per_vertex_output: {
2500       fs_reg indirect_offset = get_indirect_offset(instr);
2501       unsigned imm_offset = instr->const_index[0];
2502       unsigned first_component = nir_intrinsic_component(instr);
2503
2504       fs_inst *inst;
2505       if (indirect_offset.file == BAD_FILE) {
2506          /* Replicate the patch handle to all enabled channels */
2507          fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2508          bld.MOV(patch_handle,
2509                  retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
2510
2511          {
2512             if (first_component != 0) {
2513                unsigned read_components =
2514                   instr->num_components + first_component;
2515                fs_reg tmp = bld.vgrf(dst.type, read_components);
2516                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2517                                patch_handle);
2518                inst->size_written = read_components * REG_SIZE;
2519                for (unsigned i = 0; i < instr->num_components; i++) {
2520                   bld.MOV(offset(dst, bld, i),
2521                           offset(tmp, bld, i + first_component));
2522                }
2523             } else {
2524                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
2525                                patch_handle);
2526                inst->size_written = instr->num_components * REG_SIZE;
2527             }
2528             inst->offset = imm_offset;
2529             inst->mlen = 1;
2530          }
2531       } else {
2532          /* Indirect indexing - use per-slot offsets as well. */
2533          const fs_reg srcs[] = {
2534             retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2535             indirect_offset
2536          };
2537          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2538          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2539          if (first_component != 0) {
2540             unsigned read_components =
2541                instr->num_components + first_component;
2542             fs_reg tmp = bld.vgrf(dst.type, read_components);
2543             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2544                             payload);
2545             inst->size_written = read_components * REG_SIZE;
2546             for (unsigned i = 0; i < instr->num_components; i++) {
2547                bld.MOV(offset(dst, bld, i),
2548                        offset(tmp, bld, i + first_component));
2549             }
2550          } else {
2551             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2552                             payload);
2553             inst->size_written = instr->num_components * REG_SIZE;
2554          }
2555          inst->offset = imm_offset;
2556          inst->mlen = 2;
2557       }
2558       break;
2559    }
2560
2561    case nir_intrinsic_store_output:
2562    case nir_intrinsic_store_per_vertex_output: {
2563       fs_reg value = get_nir_src(instr->src[0]);
2564       bool is_64bit = (instr->src[0].is_ssa ?
2565          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
2566       fs_reg indirect_offset = get_indirect_offset(instr);
2567       unsigned imm_offset = instr->const_index[0];
2568       unsigned swiz = BRW_SWIZZLE_XYZW;
2569       unsigned mask = instr->const_index[1];
2570       unsigned header_regs = 0;
2571       fs_reg srcs[7];
2572       srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
2573
2574       if (indirect_offset.file != BAD_FILE) {
2575          srcs[header_regs++] = indirect_offset;
2576       }
2577
2578       if (mask == 0)
2579          break;
2580
2581       unsigned num_components = util_last_bit(mask);
2582       enum opcode opcode;
2583
2584       /* We can only pack two 64-bit components in a single message, so send
2585        * 2 messages if we have more components
2586        */
2587       unsigned num_iterations = 1;
2588       unsigned iter_components = num_components;
2589       unsigned first_component = nir_intrinsic_component(instr);
2590       if (is_64bit) {
2591          first_component = first_component / 2;
2592          if (instr->num_components > 2) {
2593             num_iterations = 2;
2594             iter_components = 2;
2595          }
2596       }
2597
2598       /* 64-bit data needs to be shuffled before we can write it to the URB.
2599        * We will use this temporary to shuffle the components in each
2600        * iteration.
2601        */
2602       fs_reg tmp =
2603          fs_reg(VGRF, alloc.allocate(2 * iter_components), value.type);
2604
2605       mask = mask << first_component;
2606
2607       for (unsigned iter = 0; iter < num_iterations; iter++) {
2608          if (!is_64bit && mask != WRITEMASK_XYZW) {
2609             srcs[header_regs++] = brw_imm_ud(mask << 16);
2610             opcode = indirect_offset.file != BAD_FILE ?
2611                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2612                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2613          } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) {
2614             /* Expand the 64-bit mask to 32-bit channels. We only handle
2615              * two channels in each iteration, so we only care about X/Y.
2616              */
2617             unsigned mask32 = 0;
2618             if (mask & WRITEMASK_X)
2619                mask32 |= WRITEMASK_XY;
2620             if (mask & WRITEMASK_Y)
2621                mask32 |= WRITEMASK_ZW;
2622
2623             /* If the mask does not include any of the channels X or Y there
2624              * is nothing to do in this iteration. Move on to the next couple
2625              * of 64-bit channels.
2626              */
2627             if (!mask32) {
2628                mask >>= 2;
2629                imm_offset++;
2630                continue;
2631             }
2632
2633             srcs[header_regs++] = brw_imm_ud(mask32 << 16);
2634             opcode = indirect_offset.file != BAD_FILE ?
2635                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2636                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2637          } else {
2638             opcode = indirect_offset.file != BAD_FILE ?
2639                SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
2640                SHADER_OPCODE_URB_WRITE_SIMD8;
2641          }
2642
2643          for (unsigned i = 0; i < iter_components; i++) {
2644             if (!(mask & (1 << (i + first_component))))
2645                continue;
2646
2647             if (!is_64bit) {
2648                srcs[header_regs + i + first_component] =
2649                   offset(value, bld, BRW_GET_SWZ(swiz, i));
2650             } else {
2651                /* We need to shuffle the 64-bit data to match the layout
2652                 * expected by our 32-bit URB write messages. We use a temporary
2653                 * for that.
2654                 */
2655                unsigned channel = BRW_GET_SWZ(swiz, iter * 2 + i);
2656                shuffle_64bit_data_for_32bit_write(bld,
2657                   retype(offset(tmp, bld, 2 * i), BRW_REGISTER_TYPE_F),
2658                   retype(offset(value, bld, 2 * channel), BRW_REGISTER_TYPE_DF),
2659                   1);
2660
2661                /* Now copy the data to the destination */
2662                fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type);
2663                unsigned idx = 2 * i;
2664                bld.MOV(dest, offset(tmp, bld, idx));
2665                bld.MOV(offset(dest, bld, 1), offset(tmp, bld, idx + 1));
2666                srcs[header_regs + idx + first_component * 2] = dest;
2667                srcs[header_regs + idx + 1 + first_component * 2] =
2668                   offset(dest, bld, 1);
2669             }
2670          }
2671
2672          unsigned mlen =
2673             header_regs + (is_64bit ? 2 * iter_components : iter_components) +
2674             (is_64bit ? 2 * first_component : first_component);
2675          fs_reg payload =
2676             bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
2677          bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
2678
2679          fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
2680          inst->offset = imm_offset;
2681          inst->mlen = mlen;
2682
2683          /* If this is a 64-bit attribute, select the next two 64-bit channels
2684           * to be handled in the next iteration.
2685           */
2686          if (is_64bit) {
2687             mask >>= 2;
2688             imm_offset++;
2689          }
2690       }
2691       break;
2692    }
2693
2694    default:
2695       nir_emit_intrinsic(bld, instr);
2696       break;
2697    }
2698 }
2699
2700 void
2701 fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
2702                                    nir_intrinsic_instr *instr)
2703 {
2704    assert(stage == MESA_SHADER_TESS_EVAL);
2705    struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
2706
2707    fs_reg dest;
2708    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2709       dest = get_nir_dest(instr->dest);
2710
2711    switch (instr->intrinsic) {
2712    case nir_intrinsic_load_primitive_id:
2713       bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
2714       break;
2715    case nir_intrinsic_load_tess_coord:
2716       /* gl_TessCoord is part of the payload in g1-3 */
2717       for (unsigned i = 0; i < 3; i++) {
2718          bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
2719       }
2720       break;
2721
2722    case nir_intrinsic_load_input:
2723    case nir_intrinsic_load_per_vertex_input: {
2724       fs_reg indirect_offset = get_indirect_offset(instr);
2725       unsigned imm_offset = instr->const_index[0];
2726       unsigned first_component = nir_intrinsic_component(instr);
2727
2728       if (type_sz(dest.type) == 8) {
2729          first_component = first_component / 2;
2730       }
2731
2732       fs_inst *inst;
2733       if (indirect_offset.file == BAD_FILE) {
2734          /* Arbitrarily only push up to 32 vec4 slots worth of data,
2735           * which is 16 registers (since each holds 2 vec4 slots).
2736           */
2737          const unsigned max_push_slots = 32;
2738          if (imm_offset < max_push_slots) {
2739             fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
2740             for (int i = 0; i < instr->num_components; i++) {
2741                unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) +
2742                   i + first_component;
2743                bld.MOV(offset(dest, bld, i), component(src, comp));
2744             }
2745             tes_prog_data->base.urb_read_length =
2746                MAX2(tes_prog_data->base.urb_read_length,
2747                     DIV_ROUND_UP(imm_offset + 1, 2));
2748          } else {
2749             /* Replicate the patch handle to all enabled channels */
2750             const fs_reg srcs[] = {
2751                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
2752             };
2753             fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2754             bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
2755
2756             if (first_component != 0) {
2757                unsigned read_components =
2758                   instr->num_components + first_component;
2759                fs_reg tmp = bld.vgrf(dest.type, read_components);
2760                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2761                                patch_handle);
2762                inst->size_written = read_components * REG_SIZE;
2763                for (unsigned i = 0; i < instr->num_components; i++) {
2764                   bld.MOV(offset(dest, bld, i),
2765                           offset(tmp, bld, i + first_component));
2766                }
2767             } else {
2768                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
2769                                patch_handle);
2770                inst->size_written = instr->num_components * REG_SIZE;
2771             }
2772             inst->mlen = 1;
2773             inst->offset = imm_offset;
2774          }
2775       } else {
2776          /* Indirect indexing - use per-slot offsets as well. */
2777
2778          /* We can only read two double components with each URB read, so
2779           * we send two read messages in that case, each one loading up to
2780           * two double components.
2781           */
2782          unsigned num_iterations = 1;
2783          unsigned num_components = instr->num_components;
2784          fs_reg orig_dest = dest;
2785          if (type_sz(dest.type) == 8) {
2786             if (instr->num_components > 2) {
2787                num_iterations = 2;
2788                num_components = 2;
2789             }
2790             fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type);
2791             dest = tmp;
2792          }
2793
2794          for (unsigned iter = 0; iter < num_iterations; iter++) {
2795             const fs_reg srcs[] = {
2796                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2797                indirect_offset
2798             };
2799             fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2800             bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2801
2802             if (first_component != 0) {
2803                unsigned read_components =
2804                    num_components + first_component;
2805                fs_reg tmp = bld.vgrf(dest.type, read_components);
2806                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2807                                payload);
2808                for (unsigned i = 0; i < num_components; i++) {
2809                   bld.MOV(offset(dest, bld, i),
2810                           offset(tmp, bld, i + first_component));
2811                }
2812             } else {
2813                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,
2814                                payload);
2815             }
2816             inst->mlen = 2;
2817             inst->offset = imm_offset;
2818             inst->size_written = (num_components + first_component) *
2819                                  inst->dst.component_size(inst->exec_size);
2820
2821             /* If we are reading 64-bit data using 32-bit read messages we need
2822              * build proper 64-bit data elements by shuffling the low and high
2823              * 32-bit components around like we do for other things like UBOs
2824              * or SSBOs.
2825              */
2826             if (type_sz(dest.type) == 8) {
2827                shuffle_32bit_load_result_to_64bit_data(
2828                   bld, dest, retype(dest, BRW_REGISTER_TYPE_F), num_components);
2829
2830                for (unsigned c = 0; c < num_components; c++) {
2831                   bld.MOV(offset(orig_dest, bld, iter * 2 + c),
2832                           offset(dest, bld, c));
2833                }
2834             }
2835
2836             /* If we are loading double data and we need a second read message
2837              * adjust the offset
2838              */
2839             if (num_iterations > 1) {
2840                num_components = instr->num_components - 2;
2841                imm_offset++;
2842             }
2843          }
2844       }
2845       break;
2846    }
2847    default:
2848       nir_emit_intrinsic(bld, instr);
2849       break;
2850    }
2851 }
2852
2853 void
2854 fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
2855                                   nir_intrinsic_instr *instr)
2856 {
2857    assert(stage == MESA_SHADER_GEOMETRY);
2858    fs_reg indirect_offset;
2859
2860    fs_reg dest;
2861    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2862       dest = get_nir_dest(instr->dest);
2863
2864    switch (instr->intrinsic) {
2865    case nir_intrinsic_load_primitive_id:
2866       assert(stage == MESA_SHADER_GEOMETRY);
2867       assert(brw_gs_prog_data(prog_data)->include_primitive_id);
2868       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
2869               retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
2870       break;
2871
2872    case nir_intrinsic_load_input:
2873       unreachable("load_input intrinsics are invalid for the GS stage");
2874
2875    case nir_intrinsic_load_per_vertex_input:
2876       emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
2877                          instr->src[1], instr->num_components,
2878                          nir_intrinsic_component(instr));
2879       break;
2880
2881    case nir_intrinsic_emit_vertex_with_counter:
2882       emit_gs_vertex(instr->src[0], instr->const_index[0]);
2883       break;
2884
2885    case nir_intrinsic_end_primitive_with_counter:
2886       emit_gs_end_primitive(instr->src[0]);
2887       break;
2888
2889    case nir_intrinsic_set_vertex_count:
2890       bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
2891       break;
2892
2893    case nir_intrinsic_load_invocation_id: {
2894       fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
2895       assert(val.file != BAD_FILE);
2896       dest.type = val.type;
2897       bld.MOV(dest, val);
2898       break;
2899    }
2900
2901    default:
2902       nir_emit_intrinsic(bld, instr);
2903       break;
2904    }
2905 }
2906
2907 /**
2908  * Fetch the current render target layer index.
2909  */
2910 static fs_reg
2911 fetch_render_target_array_index(const fs_builder &bld)
2912 {
2913    if (bld.shader->devinfo->gen >= 6) {
2914       /* The render target array index is provided in the thread payload as
2915        * bits 26:16 of r0.0.
2916        */
2917       const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
2918       bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
2919               brw_imm_uw(0x7ff));
2920       return idx;
2921    } else {
2922       /* Pre-SNB we only ever render into the first layer of the framebuffer
2923        * since layered rendering is not implemented.
2924        */
2925       return brw_imm_ud(0);
2926    }
2927 }
2928
2929 /**
2930  * Fake non-coherent framebuffer read implemented using TXF to fetch from the
2931  * framebuffer at the current fragment coordinates and sample index.
2932  */
2933 fs_inst *
2934 fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
2935                                       unsigned target)
2936 {
2937    const struct gen_device_info *devinfo = bld.shader->devinfo;
2938
2939    assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
2940    const brw_wm_prog_key *wm_key =
2941       reinterpret_cast<const brw_wm_prog_key *>(key);
2942    assert(!wm_key->coherent_fb_fetch);
2943    const struct brw_wm_prog_data *wm_prog_data =
2944       brw_wm_prog_data(stage_prog_data);
2945
2946    /* Calculate the surface index relative to the start of the texture binding
2947     * table block, since that's what the texturing messages expect.
2948     */
2949    const unsigned surface = target +
2950       wm_prog_data->binding_table.render_target_read_start -
2951       wm_prog_data->base.binding_table.texture_start;
2952
2953    brw_mark_surface_used(
2954       bld.shader->stage_prog_data,
2955       wm_prog_data->binding_table.render_target_read_start + target);
2956
2957    /* Calculate the fragment coordinates. */
2958    const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
2959    bld.MOV(offset(coords, bld, 0), pixel_x);
2960    bld.MOV(offset(coords, bld, 1), pixel_y);
2961    bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
2962
2963    /* Calculate the sample index and MCS payload when multisampling.  Luckily
2964     * the MCS fetch message behaves deterministically for UMS surfaces, so it
2965     * shouldn't be necessary to recompile based on whether the framebuffer is
2966     * CMS or UMS.
2967     */
2968    if (wm_key->multisample_fbo &&
2969        nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
2970       nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
2971
2972    const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
2973    const fs_reg mcs = wm_key->multisample_fbo ?
2974       emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg();
2975
2976    /* Use either a normal or a CMS texel fetch message depending on whether
2977     * the framebuffer is single or multisample.  On SKL+ use the wide CMS
2978     * message just in case the framebuffer uses 16x multisampling, it should
2979     * be equivalent to the normal CMS fetch for lower multisampling modes.
2980     */
2981    const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL :
2982                      devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :
2983                      SHADER_OPCODE_TXF_CMS_LOGICAL;
2984
2985    /* Emit the instruction. */
2986    const fs_reg srcs[] = { coords, fs_reg(), brw_imm_ud(0), fs_reg(),
2987                            sample, mcs,
2988                            brw_imm_ud(surface), brw_imm_ud(0),
2989                            fs_reg(), brw_imm_ud(3), brw_imm_ud(0) };
2990    STATIC_ASSERT(ARRAY_SIZE(srcs) == TEX_LOGICAL_NUM_SRCS);
2991
2992    fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
2993    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
2994
2995    return inst;
2996 }
2997
2998 /**
2999  * Actual coherent framebuffer read implemented using the native render target
3000  * read message.  Requires SKL+.
3001  */
3002 static fs_inst *
3003 emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
3004 {
3005    assert(bld.shader->devinfo->gen >= 9);
3006    fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
3007    inst->target = target;
3008    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3009
3010    return inst;
3011 }
3012
3013 static fs_reg
3014 alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
3015 {
3016    if (n && regs[0].file != BAD_FILE) {
3017       return regs[0];
3018
3019    } else {
3020       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
3021
3022       for (unsigned i = 0; i < n; i++)
3023          regs[i] = tmp;
3024
3025       return tmp;
3026    }
3027 }
3028
3029 static fs_reg
3030 alloc_frag_output(fs_visitor *v, unsigned location)
3031 {
3032    assert(v->stage == MESA_SHADER_FRAGMENT);
3033    const brw_wm_prog_key *const key =
3034       reinterpret_cast<const brw_wm_prog_key *>(v->key);
3035    const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3036    const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3037
3038    if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3039       return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
3040
3041    else if (l == FRAG_RESULT_COLOR)
3042       return alloc_temporary(v->bld, 4, v->outputs,
3043                              MAX2(key->nr_color_regions, 1));
3044
3045    else if (l == FRAG_RESULT_DEPTH)
3046       return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
3047
3048    else if (l == FRAG_RESULT_STENCIL)
3049       return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
3050
3051    else if (l == FRAG_RESULT_SAMPLE_MASK)
3052       return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
3053
3054    else if (l >= FRAG_RESULT_DATA0 &&
3055             l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3056       return alloc_temporary(v->bld, 4,
3057                              &v->outputs[l - FRAG_RESULT_DATA0], 1);
3058
3059    else
3060       unreachable("Invalid location");
3061 }
3062
3063 void
3064 fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
3065                                   nir_intrinsic_instr *instr)
3066 {
3067    assert(stage == MESA_SHADER_FRAGMENT);
3068
3069    fs_reg dest;
3070    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3071       dest = get_nir_dest(instr->dest);
3072
3073    switch (instr->intrinsic) {
3074    case nir_intrinsic_load_front_face:
3075       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
3076               *emit_frontfacing_interpolation());
3077       break;
3078
3079    case nir_intrinsic_load_sample_pos: {
3080       fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
3081       assert(sample_pos.file != BAD_FILE);
3082       dest.type = sample_pos.type;
3083       bld.MOV(dest, sample_pos);
3084       bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3085       break;
3086    }
3087
3088    case nir_intrinsic_load_layer_id:
3089       dest.type = BRW_REGISTER_TYPE_UD;
3090       bld.MOV(dest, fetch_render_target_array_index(bld));
3091       break;
3092
3093    case nir_intrinsic_load_helper_invocation:
3094    case nir_intrinsic_load_sample_mask_in:
3095    case nir_intrinsic_load_sample_id: {
3096       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3097       fs_reg val = nir_system_values[sv];
3098       assert(val.file != BAD_FILE);
3099       dest.type = val.type;
3100       bld.MOV(dest, val);
3101       break;
3102    }
3103
3104    case nir_intrinsic_store_output: {
3105       const fs_reg src = get_nir_src(instr->src[0]);
3106       const nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3107       assert(const_offset && "Indirect output stores not allowed");
3108       const unsigned location = nir_intrinsic_base(instr) +
3109          SET_FIELD(const_offset->u32[0], BRW_NIR_FRAG_OUTPUT_LOCATION);
3110       const fs_reg new_dest = retype(alloc_frag_output(this, location),
3111                                      src.type);
3112
3113       for (unsigned j = 0; j < instr->num_components; j++)
3114          bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3115                  offset(src, bld, j));
3116
3117       break;
3118    }
3119
3120    case nir_intrinsic_load_output: {
3121       const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3122                                    BRW_NIR_FRAG_OUTPUT_LOCATION);
3123       assert(l >= FRAG_RESULT_DATA0);
3124       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3125       assert(const_offset && "Indirect output loads not allowed");
3126       const unsigned target = l - FRAG_RESULT_DATA0 + const_offset->u32[0];
3127       const fs_reg tmp = bld.vgrf(dest.type, 4);
3128
3129       if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
3130          emit_coherent_fb_read(bld, tmp, target);
3131       else
3132          emit_non_coherent_fb_read(bld, tmp, target);
3133
3134       for (unsigned j = 0; j < instr->num_components; j++) {
3135          bld.MOV(offset(dest, bld, j),
3136                  offset(tmp, bld, nir_intrinsic_component(instr) + j));
3137       }
3138
3139       break;
3140    }
3141
3142    case nir_intrinsic_discard:
3143    case nir_intrinsic_discard_if: {
3144       /* We track our discarded pixels in f0.1.  By predicating on it, we can
3145        * update just the flag bits that aren't yet discarded.  If there's no
3146        * condition, we emit a CMP of g0 != g0, so all currently executing
3147        * channels will get turned off.
3148        */
3149       fs_inst *cmp;
3150       if (instr->intrinsic == nir_intrinsic_discard_if) {
3151          cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
3152                        brw_imm_d(0), BRW_CONDITIONAL_Z);
3153       } else {
3154          fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3155                                        BRW_REGISTER_TYPE_UW));
3156          cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
3157       }
3158       cmp->predicate = BRW_PREDICATE_NORMAL;
3159       cmp->flag_subreg = 1;
3160
3161       if (devinfo->gen >= 6) {
3162          emit_discard_jump();
3163       }
3164       break;
3165    }
3166
3167    case nir_intrinsic_load_input: {
3168       /* load_input is only used for flat inputs */
3169       unsigned base = nir_intrinsic_base(instr);
3170       unsigned component = nir_intrinsic_component(instr);
3171       unsigned num_components = instr->num_components;
3172       enum brw_reg_type type = dest.type;
3173
3174       /* Special case fields in the VUE header */
3175       if (base == VARYING_SLOT_LAYER)
3176          component = 1;
3177       else if (base == VARYING_SLOT_VIEWPORT)
3178          component = 2;
3179
3180       if (nir_dest_bit_size(instr->dest) == 64) {
3181          /* const_index is in 32-bit type size units that could not be aligned
3182           * with DF. We need to read the double vector as if it was a float
3183           * vector of twice the number of components to fetch the right data.
3184           */
3185          type = BRW_REGISTER_TYPE_F;
3186          num_components *= 2;
3187       }
3188
3189       for (unsigned int i = 0; i < num_components; i++) {
3190          struct brw_reg interp = interp_reg(base, component + i);
3191          interp = suboffset(interp, 3);
3192          bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
3193                   retype(fs_reg(interp), type));
3194       }
3195
3196       if (nir_dest_bit_size(instr->dest) == 64) {
3197          shuffle_32bit_load_result_to_64bit_data(bld,
3198                                                  dest,
3199                                                  retype(dest, type),
3200                                                  instr->num_components);
3201       }
3202       break;
3203    }
3204
3205    case nir_intrinsic_load_barycentric_pixel:
3206    case nir_intrinsic_load_barycentric_centroid:
3207    case nir_intrinsic_load_barycentric_sample:
3208       /* Do nothing - load_interpolated_input handling will handle it later. */
3209       break;
3210
3211    case nir_intrinsic_load_barycentric_at_sample: {
3212       const glsl_interp_mode interpolation =
3213          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3214
3215       nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
3216
3217       if (const_sample) {
3218          unsigned msg_data = const_sample->i32[0] << 4;
3219
3220          emit_pixel_interpolater_send(bld,
3221                                       FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3222                                       dest,
3223                                       fs_reg(), /* src */
3224                                       brw_imm_ud(msg_data),
3225                                       interpolation);
3226       } else {
3227          const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
3228                                           BRW_REGISTER_TYPE_UD);
3229
3230          if (nir_src_is_dynamically_uniform(instr->src[0])) {
3231             const fs_reg sample_id = bld.emit_uniformize(sample_src);
3232             const fs_reg msg_data = vgrf(glsl_type::uint_type);
3233             bld.exec_all().group(1, 0)
3234                .SHL(msg_data, sample_id, brw_imm_ud(4u));
3235             emit_pixel_interpolater_send(bld,
3236                                          FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3237                                          dest,
3238                                          fs_reg(), /* src */
3239                                          msg_data,
3240                                          interpolation);
3241          } else {
3242             /* Make a loop that sends a message to the pixel interpolater
3243              * for the sample number in each live channel. If there are
3244              * multiple channels with the same sample number then these
3245              * will be handled simultaneously with a single interation of
3246              * the loop.
3247              */
3248             bld.emit(BRW_OPCODE_DO);
3249
3250             /* Get the next live sample number into sample_id_reg */
3251             const fs_reg sample_id = bld.emit_uniformize(sample_src);
3252
3253             /* Set the flag register so that we can perform the send
3254              * message on all channels that have the same sample number
3255              */
3256             bld.CMP(bld.null_reg_ud(),
3257                     sample_src, sample_id,
3258                     BRW_CONDITIONAL_EQ);
3259             const fs_reg msg_data = vgrf(glsl_type::uint_type);
3260             bld.exec_all().group(1, 0)
3261                .SHL(msg_data, sample_id, brw_imm_ud(4u));
3262             fs_inst *inst =
3263                emit_pixel_interpolater_send(bld,
3264                                             FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3265                                             dest,
3266                                             fs_reg(), /* src */
3267                                             msg_data,
3268                                             interpolation);
3269             set_predicate(BRW_PREDICATE_NORMAL, inst);
3270
3271             /* Continue the loop if there are any live channels left */
3272             set_predicate_inv(BRW_PREDICATE_NORMAL,
3273                               true, /* inverse */
3274                               bld.emit(BRW_OPCODE_WHILE));
3275          }
3276       }
3277       break;
3278    }
3279
3280    case nir_intrinsic_load_barycentric_at_offset: {
3281       const glsl_interp_mode interpolation =
3282          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3283
3284       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3285
3286       if (const_offset) {
3287          unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
3288          unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
3289
3290          emit_pixel_interpolater_send(bld,
3291                                       FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3292                                       dest,
3293                                       fs_reg(), /* src */
3294                                       brw_imm_ud(off_x | (off_y << 4)),
3295                                       interpolation);
3296       } else {
3297          fs_reg src = vgrf(glsl_type::ivec2_type);
3298          fs_reg offset_src = retype(get_nir_src(instr->src[0]),
3299                                     BRW_REGISTER_TYPE_F);
3300          for (int i = 0; i < 2; i++) {
3301             fs_reg temp = vgrf(glsl_type::float_type);
3302             bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
3303             fs_reg itemp = vgrf(glsl_type::int_type);
3304             /* float to int */
3305             bld.MOV(itemp, temp);
3306
3307             /* Clamp the upper end of the range to +7/16.
3308              * ARB_gpu_shader5 requires that we support a maximum offset
3309              * of +0.5, which isn't representable in a S0.4 value -- if
3310              * we didn't clamp it, we'd end up with -8/16, which is the
3311              * opposite of what the shader author wanted.
3312              *
3313              * This is legal due to ARB_gpu_shader5's quantization
3314              * rules:
3315              *
3316              * "Not all values of <offset> may be supported; x and y
3317              * offsets may be rounded to fixed-point values with the
3318              * number of fraction bits given by the
3319              * implementation-dependent constant
3320              * FRAGMENT_INTERPOLATION_OFFSET_BITS"
3321              */
3322             set_condmod(BRW_CONDITIONAL_L,
3323                         bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
3324          }
3325
3326          const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3327          emit_pixel_interpolater_send(bld,
3328                                       opcode,
3329                                       dest,
3330                                       src,
3331                                       brw_imm_ud(0u),
3332                                       interpolation);
3333       }
3334       break;
3335    }
3336
3337    case nir_intrinsic_load_interpolated_input: {
3338       if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
3339          emit_fragcoord_interpolation(dest);
3340          break;
3341       }
3342
3343       assert(instr->src[0].ssa &&
3344              instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3345       nir_intrinsic_instr *bary_intrinsic =
3346          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3347       nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3348       enum glsl_interp_mode interp_mode =
3349          (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3350       fs_reg dst_xy;
3351
3352       if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3353           bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3354          /* Use the result of the PI message */
3355          dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
3356       } else {
3357          /* Use the delta_xy values computed from the payload */
3358          enum brw_barycentric_mode bary =
3359             brw_barycentric_mode(interp_mode, bary_intrin);
3360
3361          dst_xy = this->delta_xy[bary];
3362       }
3363
3364       for (unsigned int i = 0; i < instr->num_components; i++) {
3365          fs_reg interp =
3366             fs_reg(interp_reg(nir_intrinsic_base(instr),
3367                               nir_intrinsic_component(instr) + i));
3368          interp.type = BRW_REGISTER_TYPE_F;
3369          dest.type = BRW_REGISTER_TYPE_F;
3370
3371          if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3372             fs_reg tmp = vgrf(glsl_type::float_type);
3373             bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
3374             bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
3375          } else {
3376             bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
3377          }
3378       }
3379       break;
3380    }
3381
3382    default:
3383       nir_emit_intrinsic(bld, instr);
3384       break;
3385    }
3386 }
3387
3388 void
3389 fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
3390                                   nir_intrinsic_instr *instr)
3391 {
3392    assert(stage == MESA_SHADER_COMPUTE);
3393    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
3394
3395    fs_reg dest;
3396    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3397       dest = get_nir_dest(instr->dest);
3398
3399    switch (instr->intrinsic) {
3400    case nir_intrinsic_barrier:
3401       emit_barrier();
3402       cs_prog_data->uses_barrier = true;
3403       break;
3404
3405    case nir_intrinsic_load_local_invocation_id:
3406    case nir_intrinsic_load_work_group_id: {
3407       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3408       fs_reg val = nir_system_values[sv];
3409       assert(val.file != BAD_FILE);
3410       dest.type = val.type;
3411       for (unsigned i = 0; i < 3; i++)
3412          bld.MOV(offset(dest, bld, i), offset(val, bld, i));
3413       break;
3414    }
3415
3416    case nir_intrinsic_load_num_work_groups: {
3417       const unsigned surface =
3418          cs_prog_data->binding_table.work_groups_start;
3419
3420       cs_prog_data->uses_num_work_groups = true;
3421
3422       fs_reg surf_index = brw_imm_ud(surface);
3423       brw_mark_surface_used(prog_data, surface);
3424
3425       /* Read the 3 GLuint components of gl_NumWorkGroups */
3426       for (unsigned i = 0; i < 3; i++) {
3427          fs_reg read_result =
3428             emit_untyped_read(bld, surf_index,
3429                               brw_imm_ud(i << 2),
3430                               1 /* dims */, 1 /* size */,
3431                               BRW_PREDICATE_NONE);
3432          read_result.type = dest.type;
3433          bld.MOV(dest, read_result);
3434          dest = offset(dest, bld, 1);
3435       }
3436       break;
3437    }
3438
3439    case nir_intrinsic_shared_atomic_add:
3440       nir_emit_shared_atomic(bld, BRW_AOP_ADD, instr);
3441       break;
3442    case nir_intrinsic_shared_atomic_imin:
3443       nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr);
3444       break;
3445    case nir_intrinsic_shared_atomic_umin:
3446       nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr);
3447       break;
3448    case nir_intrinsic_shared_atomic_imax:
3449       nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr);
3450       break;
3451    case nir_intrinsic_shared_atomic_umax:
3452       nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr);
3453       break;
3454    case nir_intrinsic_shared_atomic_and:
3455       nir_emit_shared_atomic(bld, BRW_AOP_AND, instr);
3456       break;
3457    case nir_intrinsic_shared_atomic_or:
3458       nir_emit_shared_atomic(bld, BRW_AOP_OR, instr);
3459       break;
3460    case nir_intrinsic_shared_atomic_xor:
3461       nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr);
3462       break;
3463    case nir_intrinsic_shared_atomic_exchange:
3464       nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr);
3465       break;
3466    case nir_intrinsic_shared_atomic_comp_swap:
3467       nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
3468       break;
3469
3470    case nir_intrinsic_load_shared: {
3471       assert(devinfo->gen >= 7);
3472
3473       fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
3474
3475       /* Get the offset to read from */
3476       fs_reg offset_reg;
3477       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3478       if (const_offset) {
3479          offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
3480       } else {
3481          offset_reg = vgrf(glsl_type::uint_type);
3482          bld.ADD(offset_reg,
3483                  retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
3484                  brw_imm_ud(instr->const_index[0]));
3485       }
3486
3487       /* Read the vector */
3488       do_untyped_vector_read(bld, dest, surf_index, offset_reg,
3489                              instr->num_components);
3490       break;
3491    }
3492
3493    case nir_intrinsic_store_shared: {
3494       assert(devinfo->gen >= 7);
3495
3496       /* Block index */
3497       fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
3498
3499       /* Value */
3500       fs_reg val_reg = get_nir_src(instr->src[0]);
3501
3502       /* Writemask */
3503       unsigned writemask = instr->const_index[1];
3504
3505       /* get_nir_src() retypes to integer. Be wary of 64-bit types though
3506        * since the untyped writes below operate in units of 32-bits, which
3507        * means that we need to write twice as many components each time.
3508        * Also, we have to suffle 64-bit data to be in the appropriate layout
3509        * expected by our 32-bit write messages.
3510        */
3511       unsigned type_size = 4;
3512       unsigned bit_size = instr->src[0].is_ssa ?
3513          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
3514       if (bit_size == 64) {
3515          type_size = 8;
3516          fs_reg tmp =
3517            fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
3518          shuffle_64bit_data_for_32bit_write(
3519             bld,
3520             retype(tmp, BRW_REGISTER_TYPE_F),
3521             retype(val_reg, BRW_REGISTER_TYPE_DF),
3522             instr->num_components);
3523          val_reg = tmp;
3524       }
3525
3526       unsigned type_slots = type_size / 4;
3527
3528       /* Combine groups of consecutive enabled channels in one write
3529        * message. We use ffs to find the first enabled channel and then ffs on
3530        * the bit-inverse, down-shifted writemask to determine the length of
3531        * the block of enabled bits.
3532        */
3533       while (writemask) {
3534          unsigned first_component = ffs(writemask) - 1;
3535          unsigned length = ffs(~(writemask >> first_component)) - 1;
3536
3537          /* We can't write more than 2 64-bit components at once. Limit the
3538           * length of the write to what we can do and let the next iteration
3539           * handle the rest
3540           */
3541          if (type_size > 4)
3542             length = MIN2(2, length);
3543
3544          fs_reg offset_reg;
3545          nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3546          if (const_offset) {
3547             offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] +
3548                                     type_size * first_component);
3549          } else {
3550             offset_reg = vgrf(glsl_type::uint_type);
3551             bld.ADD(offset_reg,
3552                     retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
3553                     brw_imm_ud(instr->const_index[0] + type_size * first_component));
3554          }
3555
3556          emit_untyped_write(bld, surf_index, offset_reg,
3557                             offset(val_reg, bld, first_component * type_slots),
3558                             1 /* dims */, length * type_slots,
3559                             BRW_PREDICATE_NONE);
3560
3561          /* Clear the bits in the writemask that we just wrote, then try
3562           * again to see if more channels are left.
3563           */
3564          writemask &= (15 << (first_component + length));
3565       }
3566
3567       break;
3568    }
3569
3570    default:
3571       nir_emit_intrinsic(bld, instr);
3572       break;
3573    }
3574 }
3575
3576 void
3577 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
3578 {
3579    fs_reg dest;
3580    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3581       dest = get_nir_dest(instr->dest);
3582
3583    switch (instr->intrinsic) {
3584    case nir_intrinsic_atomic_counter_inc:
3585    case nir_intrinsic_atomic_counter_dec:
3586    case nir_intrinsic_atomic_counter_read:
3587    case nir_intrinsic_atomic_counter_add:
3588    case nir_intrinsic_atomic_counter_min:
3589    case nir_intrinsic_atomic_counter_max:
3590    case nir_intrinsic_atomic_counter_and:
3591    case nir_intrinsic_atomic_counter_or:
3592    case nir_intrinsic_atomic_counter_xor:
3593    case nir_intrinsic_atomic_counter_exchange:
3594    case nir_intrinsic_atomic_counter_comp_swap: {
3595       if (stage == MESA_SHADER_FRAGMENT &&
3596           instr->intrinsic != nir_intrinsic_atomic_counter_read)
3597          brw_wm_prog_data(prog_data)->has_side_effects = true;
3598
3599       /* Get some metadata from the image intrinsic. */
3600       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3601
3602       /* Get the arguments of the atomic intrinsic. */
3603       const fs_reg offset = get_nir_src(instr->src[0]);
3604       const unsigned surface = (stage_prog_data->binding_table.abo_start +
3605                                 instr->const_index[0]);
3606       const fs_reg src0 = (info->num_srcs >= 2
3607                            ? get_nir_src(instr->src[1]) : fs_reg());
3608       const fs_reg src1 = (info->num_srcs >= 3
3609                            ? get_nir_src(instr->src[2]) : fs_reg());
3610       fs_reg tmp;
3611
3612       assert(info->num_srcs <= 3);
3613
3614       /* Emit a surface read or atomic op. */
3615       if (instr->intrinsic == nir_intrinsic_atomic_counter_read) {
3616          tmp = emit_untyped_read(bld, brw_imm_ud(surface), offset, 1, 1);
3617       } else {
3618          tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, src0,
3619                                    src1, 1, 1,
3620                                    get_atomic_counter_op(instr->intrinsic));
3621       }
3622
3623       /* Assign the result. */
3624       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp);
3625
3626       /* Mark the surface as used. */
3627       brw_mark_surface_used(stage_prog_data, surface);
3628       break;
3629    }
3630
3631    case nir_intrinsic_image_load:
3632    case nir_intrinsic_image_store:
3633    case nir_intrinsic_image_atomic_add:
3634    case nir_intrinsic_image_atomic_min:
3635    case nir_intrinsic_image_atomic_max:
3636    case nir_intrinsic_image_atomic_and:
3637    case nir_intrinsic_image_atomic_or:
3638    case nir_intrinsic_image_atomic_xor:
3639    case nir_intrinsic_image_atomic_exchange:
3640    case nir_intrinsic_image_atomic_comp_swap: {
3641       using namespace image_access;
3642
3643       if (stage == MESA_SHADER_FRAGMENT &&
3644           instr->intrinsic != nir_intrinsic_image_load)
3645          brw_wm_prog_data(prog_data)->has_side_effects = true;
3646
3647       /* Get the referenced image variable and type. */
3648       const nir_variable *var = instr->variables[0]->var;
3649       const glsl_type *type = var->type->without_array();
3650       const brw_reg_type base_type = get_image_base_type(type);
3651
3652       /* Get some metadata from the image intrinsic. */
3653       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3654       const unsigned arr_dims = type->sampler_array ? 1 : 0;
3655       const unsigned surf_dims = type->coordinate_components() - arr_dims;
3656       const unsigned format = var->data.image.format;
3657
3658       /* Get the arguments of the image intrinsic. */
3659       const fs_reg image = get_nir_image_deref(instr->variables[0]);
3660       const fs_reg addr = retype(get_nir_src(instr->src[0]),
3661                                  BRW_REGISTER_TYPE_UD);
3662       const fs_reg src0 = (info->num_srcs >= 3 ?
3663                            retype(get_nir_src(instr->src[2]), base_type) :
3664                            fs_reg());
3665       const fs_reg src1 = (info->num_srcs >= 4 ?
3666                            retype(get_nir_src(instr->src[3]), base_type) :
3667                            fs_reg());
3668       fs_reg tmp;
3669
3670       /* Emit an image load, store or atomic op. */
3671       if (instr->intrinsic == nir_intrinsic_image_load)
3672          tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
3673
3674       else if (instr->intrinsic == nir_intrinsic_image_store)
3675          emit_image_store(bld, image, addr, src0, surf_dims, arr_dims,
3676                           var->data.image.write_only ? GL_NONE : format);
3677
3678       else
3679          tmp = emit_image_atomic(bld, image, addr, src0, src1,
3680                                  surf_dims, arr_dims, info->dest_components,
3681                                  get_image_atomic_op(instr->intrinsic, type));
3682
3683       /* Assign the result. */
3684       for (unsigned c = 0; c < info->dest_components; ++c)
3685          bld.MOV(offset(retype(dest, base_type), bld, c),
3686                  offset(tmp, bld, c));
3687       break;
3688    }
3689
3690    case nir_intrinsic_memory_barrier_atomic_counter:
3691    case nir_intrinsic_memory_barrier_buffer:
3692    case nir_intrinsic_memory_barrier_image:
3693    case nir_intrinsic_memory_barrier: {
3694       const fs_builder ubld = bld.group(8, 0);
3695       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
3696       ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
3697          ->size_written = 2 * REG_SIZE;
3698       break;
3699    }
3700
3701    case nir_intrinsic_group_memory_barrier:
3702    case nir_intrinsic_memory_barrier_shared:
3703       /* We treat these workgroup-level barriers as no-ops.  This should be
3704        * safe at present and as long as:
3705        *
3706        *  - Memory access instructions are not subsequently reordered by the
3707        *    compiler back-end.
3708        *
3709        *  - All threads from a given compute shader workgroup fit within a
3710        *    single subslice and therefore talk to the same HDC shared unit
3711        *    what supposedly guarantees ordering and coherency between threads
3712        *    from the same workgroup.  This may change in the future when we
3713        *    start splitting workgroups across multiple subslices.
3714        *
3715        *  - The context is not in fault-and-stream mode, which could cause
3716        *    memory transactions (including to SLM) prior to the barrier to be
3717        *    replayed after the barrier if a pagefault occurs.  This shouldn't
3718        *    be a problem up to and including SKL because fault-and-stream is
3719        *    not usable due to hardware issues, but that's likely to change in
3720        *    the future.
3721        */
3722       break;
3723
3724    case nir_intrinsic_shader_clock: {
3725       /* We cannot do anything if there is an event, so ignore it for now */
3726       const fs_reg shader_clock = get_timestamp(bld);
3727       const fs_reg srcs[] = { component(shader_clock, 0),
3728                               component(shader_clock, 1) };
3729       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
3730       break;
3731    }
3732
3733    case nir_intrinsic_image_size: {
3734       /* Get the referenced image variable and type. */
3735       const nir_variable *var = instr->variables[0]->var;
3736       const glsl_type *type = var->type->without_array();
3737
3738       /* Get the size of the image. */
3739       const fs_reg image = get_nir_image_deref(instr->variables[0]);
3740       const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
3741
3742       /* For 1DArray image types, the array index is stored in the Z component.
3743        * Fix this by swizzling the Z component to the Y component.
3744        */
3745       const bool is_1d_array_image =
3746                   type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
3747                   type->sampler_array;
3748
3749       /* For CubeArray images, we should count the number of cubes instead
3750        * of the number of faces. Fix it by dividing the (Z component) by 6.
3751        */
3752       const bool is_cube_array_image =
3753                   type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
3754                   type->sampler_array;
3755
3756       /* Copy all the components. */
3757       for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
3758          if ((int)c >= type->coordinate_components()) {
3759              bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3760                      brw_imm_d(1));
3761          } else if (c == 1 && is_1d_array_image) {
3762             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3763                     offset(size, bld, 2));
3764          } else if (c == 2 && is_cube_array_image) {
3765             bld.emit(SHADER_OPCODE_INT_QUOTIENT,
3766                      offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3767                      offset(size, bld, c), brw_imm_d(6));
3768          } else {
3769             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
3770                     offset(size, bld, c));
3771          }
3772        }
3773
3774       break;
3775    }
3776
3777    case nir_intrinsic_image_samples:
3778       /* The driver does not support multi-sampled images. */
3779       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
3780       break;
3781
3782    case nir_intrinsic_load_uniform: {
3783       /* Offsets are in bytes but they should always be multiples of 4 */
3784       assert(instr->const_index[0] % 4 == 0);
3785
3786       fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
3787
3788       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3789       if (const_offset) {
3790          /* Offsets are in bytes but they should always be multiples of 4 */
3791          assert(const_offset->u32[0] % 4 == 0);
3792          src.offset = const_offset->u32[0];
3793
3794          for (unsigned j = 0; j < instr->num_components; j++) {
3795             bld.MOV(offset(dest, bld, j), offset(src, bld, j));
3796          }
3797       } else {
3798          fs_reg indirect = retype(get_nir_src(instr->src[0]),
3799                                   BRW_REGISTER_TYPE_UD);
3800
3801          /* We need to pass a size to the MOV_INDIRECT but we don't want it to
3802           * go past the end of the uniform.  In order to keep the n'th
3803           * component from running past, we subtract off the size of all but
3804           * one component of the vector.
3805           */
3806          assert(instr->const_index[1] >=
3807                 instr->num_components * (int) type_sz(dest.type));
3808          unsigned read_size = instr->const_index[1] -
3809             (instr->num_components - 1) * type_sz(dest.type);
3810
3811          bool supports_64bit_indirects =
3812             !devinfo->is_cherryview && !devinfo->is_broxton;
3813
3814          if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
3815             for (unsigned j = 0; j < instr->num_components; j++) {
3816                bld.emit(SHADER_OPCODE_MOV_INDIRECT,
3817                         offset(dest, bld, j), offset(src, bld, j),
3818                         indirect, brw_imm_ud(read_size));
3819             }
3820          } else {
3821             const unsigned num_mov_indirects =
3822                type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
3823             /* We read a little bit less per MOV INDIRECT, as they are now
3824              * 32-bits ones instead of 64-bit. Fix read_size then.
3825              */
3826             const unsigned read_size_32bit = read_size -
3827                 (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
3828             for (unsigned j = 0; j < instr->num_components; j++) {
3829                for (unsigned i = 0; i < num_mov_indirects; i++) {
3830                   bld.emit(SHADER_OPCODE_MOV_INDIRECT,
3831                            subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
3832                            subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
3833                            indirect, brw_imm_ud(read_size_32bit));
3834                }
3835             }
3836          }
3837       }
3838       break;
3839    }
3840
3841    case nir_intrinsic_load_ubo: {
3842       nir_const_value *const_index = nir_src_as_const_value(instr->src[0]);
3843       fs_reg surf_index;
3844
3845       if (const_index) {
3846          const unsigned index = stage_prog_data->binding_table.ubo_start +
3847                                 const_index->u32[0];
3848          surf_index = brw_imm_ud(index);
3849          brw_mark_surface_used(prog_data, index);
3850       } else {
3851          /* The block index is not a constant. Evaluate the index expression
3852           * per-channel and add the base UBO index; we have to select a value
3853           * from any live channel.
3854           */
3855          surf_index = vgrf(glsl_type::uint_type);
3856          bld.ADD(surf_index, get_nir_src(instr->src[0]),
3857                  brw_imm_ud(stage_prog_data->binding_table.ubo_start));
3858          surf_index = bld.emit_uniformize(surf_index);
3859
3860          /* Assume this may touch any UBO. It would be nice to provide
3861           * a tighter bound, but the array information is already lowered away.
3862           */
3863          brw_mark_surface_used(prog_data,
3864                                stage_prog_data->binding_table.ubo_start +
3865                                nir->info->num_ubos - 1);
3866       }
3867
3868       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3869       if (const_offset == NULL) {
3870          fs_reg base_offset = retype(get_nir_src(instr->src[1]),
3871                                      BRW_REGISTER_TYPE_UD);
3872
3873          for (int i = 0; i < instr->num_components; i++)
3874             VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
3875                                        base_offset, i * type_sz(dest.type));
3876       } else {
3877          /* Even if we are loading doubles, a pull constant load will load
3878           * a 32-bit vec4, so should only reserve vgrf space for that. If we
3879           * need to load a full dvec4 we will have to emit 2 loads. This is
3880           * similar to demote_pull_constants(), except that in that case we
3881           * see individual accesses to each component of the vector and then
3882           * we let CSE deal with duplicate loads. Here we see a vector access
3883           * and we have to split it if necessary.
3884           */
3885          const unsigned type_size = type_sz(dest.type);
3886          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
3887          const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
3888          const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
3889
3890          for (unsigned c = 0; c < instr->num_components;) {
3891             const unsigned base = const_offset->u32[0] + c * type_size;
3892             /* Number of usable components in the next block-aligned load. */
3893             const unsigned count = MIN2(instr->num_components - c,
3894                                         (block_sz - base % block_sz) / type_size);
3895
3896             ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
3897                       packed_consts, surf_index,
3898                       brw_imm_ud(base & ~(block_sz - 1)));
3899
3900             const fs_reg consts =
3901                retype(byte_offset(packed_consts, base & (block_sz - 1)),
3902                       dest.type);
3903
3904             for (unsigned d = 0; d < count; d++)
3905                bld.MOV(offset(dest, bld, c + d), component(consts, d));
3906
3907             c += count;
3908          }
3909       }
3910       break;
3911    }
3912
3913    case nir_intrinsic_load_ssbo: {
3914       assert(devinfo->gen >= 7);
3915
3916       nir_const_value *const_uniform_block =
3917          nir_src_as_const_value(instr->src[0]);
3918
3919       fs_reg surf_index;
3920       if (const_uniform_block) {
3921          unsigned index = stage_prog_data->binding_table.ssbo_start +
3922                           const_uniform_block->u32[0];
3923          surf_index = brw_imm_ud(index);
3924          brw_mark_surface_used(prog_data, index);
3925       } else {
3926          surf_index = vgrf(glsl_type::uint_type);
3927          bld.ADD(surf_index, get_nir_src(instr->src[0]),
3928                  brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
3929
3930          /* Assume this may touch any SSBO. It would be nice to provide
3931           * a tighter bound, but the array information is already lowered away.
3932           */
3933          brw_mark_surface_used(prog_data,
3934                                stage_prog_data->binding_table.ssbo_start +
3935                                nir->info->num_ssbos - 1);
3936       }
3937
3938       fs_reg offset_reg;
3939       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3940       if (const_offset) {
3941          offset_reg = brw_imm_ud(const_offset->u32[0]);
3942       } else {
3943          offset_reg = get_nir_src(instr->src[1]);
3944       }
3945
3946       /* Read the vector */
3947       do_untyped_vector_read(bld, dest, surf_index, offset_reg,
3948                              instr->num_components);
3949
3950       break;
3951    }
3952
3953    case nir_intrinsic_store_ssbo: {
3954       assert(devinfo->gen >= 7);
3955
3956       if (stage == MESA_SHADER_FRAGMENT)
3957          brw_wm_prog_data(prog_data)->has_side_effects = true;
3958
3959       /* Block index */
3960       fs_reg surf_index;
3961       nir_const_value *const_uniform_block =
3962          nir_src_as_const_value(instr->src[1]);
3963       if (const_uniform_block) {
3964          unsigned index = stage_prog_data->binding_table.ssbo_start +
3965                           const_uniform_block->u32[0];
3966          surf_index = brw_imm_ud(index);
3967          brw_mark_surface_used(prog_data, index);
3968       } else {
3969          surf_index = vgrf(glsl_type::uint_type);
3970          bld.ADD(surf_index, get_nir_src(instr->src[1]),
3971                   brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
3972
3973          brw_mark_surface_used(prog_data,
3974                                stage_prog_data->binding_table.ssbo_start +
3975                                nir->info->num_ssbos - 1);
3976       }
3977
3978       /* Value */
3979       fs_reg val_reg = get_nir_src(instr->src[0]);
3980
3981       /* Writemask */
3982       unsigned writemask = instr->const_index[0];
3983
3984       /* get_nir_src() retypes to integer. Be wary of 64-bit types though
3985        * since the untyped writes below operate in units of 32-bits, which
3986        * means that we need to write twice as many components each time.
3987        * Also, we have to shuffle 64-bit data to be in the appropriate layout
3988        * expected by our 32-bit write messages.
3989        */
3990       unsigned type_size = 4;
3991       unsigned bit_size = instr->src[0].is_ssa ?
3992          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
3993       if (bit_size == 64) {
3994          type_size = 8;
3995          fs_reg tmp =
3996            fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
3997          shuffle_64bit_data_for_32bit_write(bld,
3998             retype(tmp, BRW_REGISTER_TYPE_F),
3999             retype(val_reg, BRW_REGISTER_TYPE_DF),
4000             instr->num_components);
4001          val_reg = tmp;
4002       }
4003
4004       unsigned type_slots = type_size / 4;
4005
4006       /* Combine groups of consecutive enabled channels in one write
4007        * message. We use ffs to find the first enabled channel and then ffs on
4008        * the bit-inverse, down-shifted writemask to determine the length of
4009        * the block of enabled bits.
4010        */
4011       while (writemask) {
4012          unsigned first_component = ffs(writemask) - 1;
4013          unsigned length = ffs(~(writemask >> first_component)) - 1;
4014
4015          /* We can't write more than 2 64-bit components at once. Limit the
4016           * length of the write to what we can do and let the next iteration
4017           * handle the rest
4018           */
4019          if (type_size > 4)
4020             length = MIN2(2, length);
4021
4022          fs_reg offset_reg;
4023          nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
4024          if (const_offset) {
4025             offset_reg = brw_imm_ud(const_offset->u32[0] +
4026                                     type_size * first_component);
4027          } else {
4028             offset_reg = vgrf(glsl_type::uint_type);
4029             bld.ADD(offset_reg,
4030                     retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD),
4031                     brw_imm_ud(type_size * first_component));
4032          }
4033
4034
4035          emit_untyped_write(bld, surf_index, offset_reg,
4036                             offset(val_reg, bld, first_component * type_slots),
4037                             1 /* dims */, length * type_slots,
4038                             BRW_PREDICATE_NONE);
4039
4040          /* Clear the bits in the writemask that we just wrote, then try
4041           * again to see if more channels are left.
4042           */
4043          writemask &= (15 << (first_component + length));
4044       }
4045       break;
4046    }
4047
4048    case nir_intrinsic_store_output: {
4049       fs_reg src = get_nir_src(instr->src[0]);
4050
4051       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
4052       assert(const_offset && "Indirect output stores not allowed");
4053       fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
4054                                       4 * const_offset->u32[0]), src.type);
4055
4056       unsigned num_components = instr->num_components;
4057       unsigned first_component = nir_intrinsic_component(instr);
4058       unsigned bit_size = instr->src[0].is_ssa ?
4059          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
4060       if (bit_size == 64) {
4061          fs_reg tmp =
4062             fs_reg(VGRF, alloc.allocate(2 * num_components),
4063                    BRW_REGISTER_TYPE_F);
4064          shuffle_64bit_data_for_32bit_write(
4065             bld, tmp, retype(src, BRW_REGISTER_TYPE_DF), num_components);
4066          src = retype(tmp, src.type);
4067          num_components *= 2;
4068       }
4069
4070       for (unsigned j = 0; j < num_components; j++) {
4071          bld.MOV(offset(new_dest, bld, j + first_component),
4072                  offset(src, bld, j));
4073       }
4074       break;
4075    }
4076
4077    case nir_intrinsic_ssbo_atomic_add:
4078       nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr);
4079       break;
4080    case nir_intrinsic_ssbo_atomic_imin:
4081       nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
4082       break;
4083    case nir_intrinsic_ssbo_atomic_umin:
4084       nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
4085       break;
4086    case nir_intrinsic_ssbo_atomic_imax:
4087       nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
4088       break;
4089    case nir_intrinsic_ssbo_atomic_umax:
4090       nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
4091       break;
4092    case nir_intrinsic_ssbo_atomic_and:
4093       nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
4094       break;
4095    case nir_intrinsic_ssbo_atomic_or:
4096       nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
4097       break;
4098    case nir_intrinsic_ssbo_atomic_xor:
4099       nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
4100       break;
4101    case nir_intrinsic_ssbo_atomic_exchange:
4102       nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
4103       break;
4104    case nir_intrinsic_ssbo_atomic_comp_swap:
4105       nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
4106       break;
4107
4108    case nir_intrinsic_get_buffer_size: {
4109       nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
4110       unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
4111
4112       /* A resinfo's sampler message is used to get the buffer size.  The
4113        * SIMD8's writeback message consists of four registers and SIMD16's
4114        * writeback message consists of 8 destination registers (two per each
4115        * component).  Because we are only interested on the first channel of
4116        * the first returned component, where resinfo returns the buffer size
4117        * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
4118        * the dispatch width.
4119        */
4120       const fs_builder ubld = bld.exec_all().group(8, 0);
4121       fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4122       fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4123
4124       /* Set LOD = 0 */
4125       ubld.MOV(src_payload, brw_imm_d(0));
4126
4127       const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
4128       fs_inst *inst = ubld.emit(FS_OPCODE_GET_BUFFER_SIZE, ret_payload,
4129                                 src_payload, brw_imm_ud(index));
4130       inst->header_size = 0;
4131       inst->mlen = 1;
4132       inst->size_written = 4 * REG_SIZE;
4133
4134       bld.MOV(retype(dest, ret_payload.type), component(ret_payload, 0));
4135       brw_mark_surface_used(prog_data, index);
4136       break;
4137    }
4138
4139    case nir_intrinsic_load_channel_num: {
4140       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
4141       dest = retype(dest, BRW_REGISTER_TYPE_UD);
4142       const fs_builder allbld8 = bld.group(8, 0).exec_all();
4143       allbld8.MOV(tmp, brw_imm_v(0x76543210));
4144       if (dispatch_width > 8)
4145          allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u));
4146       if (dispatch_width > 16) {
4147          const fs_builder allbld16 = bld.group(16, 0).exec_all();
4148          allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u));
4149       }
4150       bld.MOV(dest, tmp);
4151       break;
4152    }
4153
4154    default:
4155       unreachable("unknown intrinsic");
4156    }
4157 }
4158
     /**
      * Emits the untyped atomic message for an SSBO atomic intrinsic.
      *
      * @op is the BRW_AOP_* code that is passed through unchanged to
      * emit_untyped_atomic(). The operands are read from @instr: src[0] is the
      * block index (added to binding_table.ssbo_start to form the surface
      * index), src[1] is the offset, src[2] is the first data operand and, for
      * BRW_AOP_CMPWR only, src[3] is the second data operand. The value
      * returned by the atomic is copied into the intrinsic's destination.
      */
4159 void
4160 fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
4161                                  int op, nir_intrinsic_instr *instr)
4162 {
        /* Record in the fragment program data that the shader has side effects */
4163    if (stage == MESA_SHADER_FRAGMENT)
4164       brw_wm_prog_data(prog_data)->has_side_effects = true;
4165
4166    fs_reg dest;
4167    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4168       dest = get_nir_dest(instr->dest);
4169
        /* Surface index: an immediate when the block index is constant,
         * otherwise computed at run time into a fresh VGRF.
         */
4170    fs_reg surface;
4171    nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
4172    if (const_surface) {
4173       unsigned surf_index = stage_prog_data->binding_table.ssbo_start +
4174                             const_surface->u32[0];
4175       surface = brw_imm_ud(surf_index);
4176       brw_mark_surface_used(prog_data, surf_index);
4177    } else {
4178       surface = vgrf(glsl_type::uint_type);
4179       bld.ADD(surface, get_nir_src(instr->src[0]),
4180               brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
4181
4182       /* Assume this may touch any SSBO. This is the same we do for other
4183        * UBO/SSBO accesses with non-constant surface.
4184        */
4185       brw_mark_surface_used(prog_data,
4186                             stage_prog_data->binding_table.ssbo_start +
4187                             nir->info->num_ssbos - 1);
4188    }
4189
4190    fs_reg offset = get_nir_src(instr->src[1]);
4191    fs_reg data1 = get_nir_src(instr->src[2]);
        /* data2 is only filled in for compare-and-write; otherwise it is left
         * default-constructed.
         */
4192    fs_reg data2;
4193    if (op == BRW_AOP_CMPWR)
4194       data2 = get_nir_src(instr->src[3]);
4195
4196    /* Emit the actual atomic operation */
4197
4198    fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
4199                                               data1, data2,
4200                                               1 /* dims */, 1 /* rsize */,
4201                                               op,
4202                                               BRW_PREDICATE_NONE);
        /* Give the destination the result's type so both MOV operands match */
4203    dest.type = atomic_result.type;
4204    bld.MOV(dest, atomic_result);
4205 }
4206
     /**
      * Emits the untyped atomic message for a shared-variable atomic intrinsic.
      *
      * The surface is always the immediate GEN7_BTI_SLM. @op is the BRW_AOP_*
      * code that is passed through unchanged to emit_untyped_atomic(). The
      * operands are read from @instr: src[0] is the offset (const_index[0] is
      * added to it), src[1] is the first data operand and, for BRW_AOP_CMPWR
      * only, src[2] is the second data operand. The value returned by the
      * atomic is copied into the intrinsic's destination.
      */
4207 void
4208 fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
4209                                    int op, nir_intrinsic_instr *instr)
4210 {
4211    fs_reg dest;
4212    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4213       dest = get_nir_dest(instr->dest);
4214
4215    fs_reg surface = brw_imm_ud(GEN7_BTI_SLM);
4216    fs_reg offset;
4217    fs_reg data1 = get_nir_src(instr->src[1]);
        /* data2 is only filled in for compare-and-write; otherwise it is left
         * default-constructed.
         */
4218    fs_reg data2;
4219    if (op == BRW_AOP_CMPWR)
4220       data2 = get_nir_src(instr->src[2]);
4221
4222    /* Get the offset: src[0] plus const_index[0], folded when src[0] is constant */
4223    nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
4224    if (const_offset) {
4225       offset = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
4226    } else {
4227       offset = vgrf(glsl_type::uint_type);
4228       bld.ADD(offset,
4229               retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
4230               brw_imm_ud(instr->const_index[0]));
4231    }
4232
4233    /* Emit the actual atomic operation */
4234
4235    fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
4236                                               data1, data2,
4237                                               1 /* dims */, 1 /* rsize */,
4238                                               op,
4239                                               BRW_PREDICATE_NONE);
        /* Give the destination the result's type so both MOV operands match */
4240    dest.type = atomic_result.type;
4241    bld.MOV(dest, atomic_result);
4242 }
4243
     /**
      * Translates a NIR texture instruction into a logical texturing
      * instruction.
      *
      * The function works in four steps:
      *  1. Each NIR texture source is stored, with the register type its
      *     source kind calls for, in the matching TEX_LOGICAL_SRC_* slot of
      *     a source array.
      *  2. The logical opcode is picked from instr->op.
      *     nir_texop_samples_identical is special: it only compares the MCS
      *     value against zero, writes the comparison result to the NIR
      *     destination and returns early.
      *  3. The instruction is emitted into a temporary four-component VGRF
      *     and its offset, size_written and shadow_compare fields are set.
      *  4. The first nir_tex_instr_dest_size() components (with fix-ups for
      *     query_levels and pre-Gen7 txs) are copied to the NIR destination
      *     with LOAD_PAYLOAD.
      */
4244 void
4245 fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
4246 {
4247    unsigned texture = instr->texture_index;
4248    unsigned sampler = instr->sampler_index;
4249
4250    fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
4251
        /* Start from the constant indices; the texture_offset, sampler_offset
         * and plane sources below may replace them.
         */
4252    srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
4253    srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);
4254
4255    int lod_components = 0;
4256
4257    /* The hardware requires a LOD for buffer textures */
4258    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
4259       srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
4260
        /* Accumulates the packed constant texel offset and, for tg4, the gather
         * channel; it is stored in inst->offset once the instruction is emitted.
         */
4261    uint32_t header_bits = 0;
4262    for (unsigned i = 0; i < instr->num_srcs; i++) {
4263       fs_reg src = get_nir_src(instr->src[i].src);
4264       switch (instr->src[i].src_type) {
4265       case nir_tex_src_bias:
4266          srcs[TEX_LOGICAL_SRC_LOD] =
4267             retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
4268          break;
4269       case nir_tex_src_comparator:
4270          srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
4271          break;
4272       case nir_tex_src_coord:
              /* The texel-fetch style ops take signed integer coordinates, every
               * other op takes float coordinates.
               */
4273          switch (instr->op) {
4274          case nir_texop_txf:
4275          case nir_texop_txf_ms:
4276          case nir_texop_txf_ms_mcs:
4277          case nir_texop_samples_identical:
4278             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
4279             break;
4280          default:
4281             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
4282             break;
4283          }
4284          break;
4285       case nir_tex_src_ddx:
              /* The x derivative goes in the LOD slot, the y derivative in LOD2 */
4286          srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
4287          lod_components = nir_tex_instr_src_size(instr, i);
4288          break;
4289       case nir_tex_src_ddy:
4290          srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
4291          break;
4292       case nir_tex_src_lod:
              /* The LOD register type depends on the operation: UD for txs, D for
               * txf and F for everything else.
               */
4293          switch (instr->op) {
4294          case nir_texop_txs:
4295             srcs[TEX_LOGICAL_SRC_LOD] =
4296                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
4297             break;
4298          case nir_texop_txf:
4299             srcs[TEX_LOGICAL_SRC_LOD] =
4300                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
4301             break;
4302          default:
4303             srcs[TEX_LOGICAL_SRC_LOD] =
4304                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
4305             break;
4306          }
4307          break;
4308       case nir_tex_src_ms_index:
4309          srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
4310          break;
4311
4312       case nir_tex_src_offset: {
              /* A constant offset that brw_texture_offset() accepts is folded into
               * header_bits; any other offset is passed as a register source.
               */
4313          nir_const_value *const_offset =
4314             nir_src_as_const_value(instr->src[i].src);
4315          unsigned offset_bits = 0;
4316          if (const_offset &&
4317              brw_texture_offset(const_offset->i32,
4318                                 nir_tex_instr_src_size(instr, i),
4319                                 &offset_bits)) {
4320             header_bits |= offset_bits;
4321          } else {
4322             srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
4323                retype(src, BRW_REGISTER_TYPE_D);
4324          }
4325          break;
4326       }
4327
4328       case nir_tex_src_projector:
4329          unreachable("should be lowered");
4330
4331       case nir_tex_src_texture_offset: {
4332          /* Figure out the highest possible texture index and mark it as used */
4333          uint32_t max_used = texture + instr->texture_array_size - 1;
4334          if (instr->op == nir_texop_tg4 && devinfo->gen < 8) {
4335             max_used += stage_prog_data->binding_table.gather_texture_start;
4336          } else {
4337             max_used += stage_prog_data->binding_table.texture_start;
4338          }
4339          brw_mark_surface_used(prog_data, max_used);
4340
4341          /* Emit code to evaluate the actual indexing expression */
4342          fs_reg tmp = vgrf(glsl_type::uint_type);
4343          bld.ADD(tmp, src, brw_imm_ud(texture));
4344          srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
4345          break;
4346       }
4347
4348       case nir_tex_src_sampler_offset: {
4349          /* Emit code to evaluate the actual indexing expression */
4350          fs_reg tmp = vgrf(glsl_type::uint_type);
4351          bld.ADD(tmp, src, brw_imm_ud(sampler));
4352          srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
4353          break;
4354       }
4355
4356       case nir_tex_src_ms_mcs:
4357          assert(instr->op == nir_texop_txf_ms);
4358          srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
4359          break;
4360
4361       case nir_tex_src_plane: {
              /* NOTE(review): const_plane is dereferenced without a NULL check, so
               * the plane source is presumably always constant here -- confirm.
               */
4362          nir_const_value *const_plane =
4363             nir_src_as_const_value(instr->src[i].src);
4364          const uint32_t plane = const_plane->u32[0];
4365          const uint32_t texture_index =
4366             instr->texture_index +
4367             stage_prog_data->binding_table.plane_start[plane] -
4368             stage_prog_data->binding_table.texture_start;
4369
4370          srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
4371          break;
4372       }
4373
4374       default:
4375          unreachable("unknown texture source");
4376       }
4377    }
4378
        /* No MCS source was supplied for an op that needs one: fetch it when the
         * texture is flagged in compressed_multisample_layout_mask on Gen7+,
         * otherwise use an immediate zero.
         */
4379    if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
4380        (instr->op == nir_texop_txf_ms ||
4381         instr->op == nir_texop_samples_identical)) {
4382       if (devinfo->gen >= 7 &&
4383           key_tex->compressed_multisample_layout_mask & (1 << texture)) {
4384          srcs[TEX_LOGICAL_SRC_MCS] =
4385             emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
4386                            instr->coord_components,
4387                            srcs[TEX_LOGICAL_SRC_SURFACE]);
4388       } else {
4389          srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
4390       }
4391    }
4392
4393    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
4394    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
4395
4396    if (instr->op == nir_texop_query_levels ||
4397        (instr->op == nir_texop_tex && stage != MESA_SHADER_FRAGMENT)) {
4398       /* textureQueryLevels() and texture() are implemented in terms of TXS
4399        * and TXL respectively, so we need to pass a valid LOD argument.
4400        */
4401       assert(srcs[TEX_LOGICAL_SRC_LOD].file == BAD_FILE);
4402       srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0u);
4403    }
4404
        /* Select the logical opcode for this operation */
4405    enum opcode opcode;
4406    switch (instr->op) {
4407    case nir_texop_tex:
4408       opcode = (stage == MESA_SHADER_FRAGMENT ? SHADER_OPCODE_TEX_LOGICAL :
4409                 SHADER_OPCODE_TXL_LOGICAL);
4410       break;
4411    case nir_texop_txb:
4412       opcode = FS_OPCODE_TXB_LOGICAL;
4413       break;
4414    case nir_texop_txl:
4415       opcode = SHADER_OPCODE_TXL_LOGICAL;
4416       break;
4417    case nir_texop_txd:
4418       opcode = SHADER_OPCODE_TXD_LOGICAL;
4419       break;
4420    case nir_texop_txf:
4421       opcode = SHADER_OPCODE_TXF_LOGICAL;
4422       break;
4423    case nir_texop_txf_ms:
4424       if ((key_tex->msaa_16 & (1 << sampler)))
4425          opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
4426       else
4427          opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
4428       break;
4429    case nir_texop_txf_ms_mcs:
4430       opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
4431       break;
4432    case nir_texop_query_levels:
4433    case nir_texop_txs:
4434       opcode = SHADER_OPCODE_TXS_LOGICAL;
4435       break;
4436    case nir_texop_lod:
4437       opcode = SHADER_OPCODE_LOD_LOGICAL;
4438       break;
4439    case nir_texop_tg4:
4440       if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
4441          opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
4442       else
4443          opcode = SHADER_OPCODE_TG4_LOGICAL;
4444       break;
4445    case nir_texop_texture_samples:
4446       opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
4447       break;
4448    case nir_texop_samples_identical: {
4449       fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
4450
4451       /* If mcs is an immediate value, it means there is no MCS.  In that case
4452        * just return false.
4453        */
4454       if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
4455          bld.MOV(dst, brw_imm_ud(0u));
4456       } else if ((key_tex->msaa_16 & (1 << sampler))) {
              /* With msaa_16 set, two MCS components are OR'ed together before
               * the comparison against zero.
               */
4457          fs_reg tmp = vgrf(glsl_type::uint_type);
4458          bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
4459                 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
4460          bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
4461       } else {
4462          bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
4463                  BRW_CONDITIONAL_EQ);
4464       }
           /* Nothing else to emit for this operation */
4465       return;
4466    }
4467    default:
4468       unreachable("unknown texture opcode");
4469    }
4470
4471    if (instr->op == nir_texop_tg4) {
4472       if (instr->component == 1 &&
4473           key_tex->gather_channel_quirk_mask & (1 << texture)) {
4474          /* gather4 sampler is broken for green channel on RG32F --
4475           * we must ask for blue instead.
4476           */
4477          header_bits |= 2 << 16;
4478       } else {
4479          header_bits |= instr->component << 16;
4480       }
4481    }
4482
        /* Emit the instruction into a temporary with room for four components */
4483    fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);
4484    fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
4485    inst->offset = header_bits;
4486
        /* On Gen9+ (except for tg4 and query_levels) size_written is trimmed to
         * the highest component that is read; otherwise all four components are
         * counted as written.
         */
4487    const unsigned dest_size = nir_tex_instr_dest_size(instr);
4488    if (devinfo->gen >= 9 &&
4489        instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
4490       unsigned write_mask = instr->dest.is_ssa ?
4491                             nir_ssa_def_components_read(&instr->dest.ssa):
4492                             (1 << dest_size) - 1;
4493       assert(write_mask != 0); /* dead code should have been eliminated */
4494       inst->size_written = util_last_bit(write_mask) *
4495                            inst->dst.component_size(inst->exec_size);
4496    } else {
4497       inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
4498    }
4499
4500    if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
4501       inst->shadow_compare = true;
4502
4503    if (instr->op == nir_texop_tg4 && devinfo->gen == 6)
4504       emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);
4505
        /* Collect the components that are copied to the NIR destination */
4506    fs_reg nir_dest[4];
4507    for (unsigned i = 0; i < dest_size; i++)
4508       nir_dest[i] = offset(dst, bld, i);
4509
4510    if (instr->op == nir_texop_query_levels) {
4511       /* # levels is in .w */
4512       nir_dest[0] = offset(dst, bld, 3);
4513    } else if (instr->op == nir_texop_txs &&
4514               dest_size >= 3 && devinfo->gen < 7) {
4515       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
4516       fs_reg depth = offset(dst, bld, 2);
4517       nir_dest[2] = vgrf(glsl_type::int_type);
4518       bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
4519    }
4520
4521    bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
4522 }
4523
     /**
      * Emits the instruction for a NIR jump.
      *
      * nir_jump_break becomes BRW_OPCODE_BREAK and nir_jump_continue becomes
      * BRW_OPCODE_CONTINUE. nir_jump_return and any other jump type fall into
      * unreachable().
      */
4524 void
4525 fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
4526 {
4527    switch (instr->type) {
4528    case nir_jump_break:
4529       bld.emit(BRW_OPCODE_BREAK);
4530       break;
4531    case nir_jump_continue:
4532       bld.emit(BRW_OPCODE_CONTINUE);
4533       break;
        /* Returns are grouped with the unknown jump types on purpose */
4534    case nir_jump_return:
4535    default:
4536       unreachable("unknown jump");
4537    }
4538 }
4539
4540 /**
4541  * This helper takes the result of a load operation that reads 32-bit elements
4542  * in this format:
4543  *
4544  * x x x x x x x x
4545  * y y y y y y y y
4546  * z z z z z z z z
4547  * w w w w w w w w
4548  *
4549  * and shuffles the data to get this:
4550  *
4551  * x y x y x y x y
4552  * x y x y x y x y
4553  * z w z w z w z w
4554  * z w z w z w z w
4555  *
4556  * Which is exactly what we want if the load is reading 64-bit components
4557  * like doubles, where x represents the low 32-bit of the x double component
4558  * and y represents the high 32-bit of the x double component (likewise with
4559  * z and w for double component y). The parameter @components represents
4560  * the number of 64-bit components present in @src. This would typically be
4561  * 2 at most, since we can only fit 2 double elements in the result of a
4562  * vec4 load.
4563  *
4564  * Notice that @dst and @src can be the same register.
      *
      * @src must have a 4-byte type and @dst an 8-byte type; both are asserted.
      * The instructions are emitted with @bld.
4565  */
4566 void
4567 shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld,
4568                                         const fs_reg &dst,
4569                                         const fs_reg &src,
4570                                         uint32_t components)
4571 {
4572    assert(type_sz(src.type) == 4);
4573    assert(type_sz(dst.type) == 8);
4574
4575    /* A temporary that we will use to shuffle the 32-bit data of each
4576     * component in the vector into valid 64-bit data. We can't write directly
4577     * to dst because dst can be (and would usually be) the same as src
4578     * and in that case the first MOV in the loop below would overwrite the
4579     * data read in the second MOV.
4580     */
4581    fs_reg tmp = bld.vgrf(dst.type);
4582
4583    for (unsigned i = 0; i < components; i++) {
           /* 64-bit component i is built from 32-bit components 2*i and 2*i+1 */
4584       const fs_reg component_i = offset(src, bld, 2 * i);
4585
4586       bld.MOV(subscript(tmp, src.type, 0), component_i);
4587       bld.MOV(subscript(tmp, src.type, 1), offset(component_i, bld, 1));
4588
4589       bld.MOV(offset(dst, bld, i), tmp);
4590    }
4591 }
4592
4593 /**
4594  * This helper does the inverse operation of
4595  * SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
4596  *
4597  * We need to do this when we are going to use untyped write messages that
4598  * operate with 32-bit components in order to arrange our 64-bit data to be
4599  * in the expected layout.
4600  *
4601  * Notice that callers of this function, unlike in the case of the inverse
4602  * operation, would typically need to call this with dst and src being
4603  * different registers, since they would otherwise corrupt the original
4604  * 64-bit data they are about to write. Because of this the function checks
4605  * that the src and dst regions involved in the operation do not overlap.
      *
      * @src must have an 8-byte type and @dst a 4-byte type; both are asserted.
      * @components is the number of 64-bit components read from @src; each one
      * is written as two consecutive 32-bit components of @dst.
4606  */
4607 void
4608 shuffle_64bit_data_for_32bit_write(const fs_builder &bld,
4609                                    const fs_reg &dst,
4610                                    const fs_reg &src,
4611                                    uint32_t components)
4612 {
4613    assert(type_sz(src.type) == 8);
4614    assert(type_sz(dst.type) == 4);
4615
        /* dst covers 2 * components 32-bit components, src covers components
         * 64-bit components.
         */
4616    assert(!regions_overlap(
4617              dst, 2 * components * dst.component_size(bld.dispatch_width()),
4618              src, components * src.component_size(bld.dispatch_width())));
4619
4620    for (unsigned i = 0; i < components; i++) {
           /* Subscript 0 goes to dst component 2*i, subscript 1 to 2*i+1 */
4621       const fs_reg component_i = offset(src, bld, i);
4622       bld.MOV(offset(dst, bld, 2 * i), subscript(component_i, dst.type, 0));
4623       bld.MOV(offset(dst, bld, 2 * i + 1), subscript(component_i, dst.type, 1));
4624    }
4625 }
4626
     /**
      * Returns a register that holds the double-precision constant @v.
      *
      * Gen8+ returns a DF immediate directly. Haswell loads the value with a
      * DIM instruction into a one-element DF VGRF. Remaining Gen7 parts write
      * the two 32-bit halves of @v into a UD VGRF and return it retyped to DF.
      * The last two paths return component 0 of the temporary they fill.
      * Any instructions are emitted through a one-channel exec_all copy of
      * @bld. Asserts that the device is Gen7 or later.
      */
4627 fs_reg
4628 setup_imm_df(const fs_builder &bld, double v)
4629 {
4630    const struct gen_device_info *devinfo = bld.shader->devinfo;
4631    assert(devinfo->gen >= 7);
4632
4633    if (devinfo->gen >= 8)
4634       return brw_imm_df(v);
4635
4636    /* gen7.5 does not support DF immediates directly, but the DIM
4637     * instruction allows setting a 64-bit immediate value.
4638     */
4639    if (devinfo->is_haswell) {
4640       const fs_builder ubld = bld.exec_all().group(1, 0);
4641       fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
4642       ubld.DIM(dst, brw_imm_df(v));
4643       return component(dst, 0);
4644    }
4645
4646    /* gen7 does not support DF immediates, so we generate a 64-bit constant by
4647     * writing the low 32-bit of the constant to suboffset 0 of a VGRF and
4648     * the high 32-bit to suboffset 4 and then applying a stride of 0.
4649     *
4650     * Alternatively, we could also produce a normal VGRF (without stride 0)
4651     * by writing to all the channels in the VGRF, however, that would hit the
4652     * gen7 bug where we have to split writes that span more than 1 register
4653     * into instructions with a width of 4 (otherwise the write to the second
4654     * register written runs into an execmask hardware bug) which isn't very
4655     * nice.
4656     */
        /* Splits the double into two 32-bit words through a union.
         * NOTE(review): i1 is the low dword only on a little-endian host --
         * confirm that is the only supported build configuration.
         */
4657    union {
4658       double d;
4659       struct {
4660          uint32_t i1;
4661          uint32_t i2;
4662       };
4663    } di;
4664
4665    di.d = v;
4666
4667    const fs_builder ubld = bld.exec_all().group(1, 0);
4668    const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4669    ubld.MOV(tmp, brw_imm_ud(di.i1));
4670    ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
4671
4672    return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
4673 }