src/mesa/drivers/dri/i965/brw_fs_nir.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "glsl/ir.h"
  25 #include "glsl/ir_optimization.h"
  26 #include "glsl/nir/glsl_to_nir.h"
  27 #include "main/shaderimage.h"
  28 #include "program/prog_to_nir.h"
  29 #include "brw_fs.h"
  30 #include "brw_fs_surface_builder.h"
  31 #include "brw_vec4_gs_visitor.h"
  32 #include "brw_nir.h"
  33 #include "brw_program.h"
  34
  35 using namespace brw;
  36 using namespace brw::surface_access;
  37
  38 void
  39 fs_visitor::emit_nir_code()
  40 {
  41    /* emit the arrays used for inputs and outputs - load/store intrinsics will
  42     * be converted to reads/writes of these arrays
  43     */
  44    nir_setup_inputs();
  45    nir_setup_outputs();
  46    nir_setup_uniforms();
  47    nir_emit_system_values();
  48
  49    /* get the main function and emit it */
  50    nir_foreach_overload(nir, overload) {
  51       assert(strcmp(overload->function->name, "main") == 0);
  52       assert(overload->impl);
  53       nir_emit_impl(overload->impl);
  54    }
  55 }
  56
  57 void
  58 fs_visitor::nir_setup_inputs()
  59 {
  60    if (stage != MESA_SHADER_FRAGMENT)
  61       return;
  62
  63    nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs);
  64
  65    nir_foreach_variable(var, &nir->inputs) {
  66       fs_reg input = offset(nir_inputs, bld, var->data.driver_location);
  67
  68       fs_reg reg;
  69       if (var->data.location == VARYING_SLOT_POS) {
  70          reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer,
  71                                              var->data.origin_upper_left);
  72          emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(),
  73                                    input, reg), 0xF);
  74       } else if (var->data.location == VARYING_SLOT_LAYER) {
  75          struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_LAYER, 1), 3);
  76          reg.type = BRW_REGISTER_TYPE_D;
  77          bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D), reg);
  78       } else if (var->data.location == VARYING_SLOT_VIEWPORT) {
  79          struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_VIEWPORT, 2), 3);
  80          reg.type = BRW_REGISTER_TYPE_D;
  81          bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D), reg);
  82       } else {
  83          emit_general_interpolation(input, var->name, var->type,
  84                                     (glsl_interp_qualifier) var->data.interpolation,
  85                                     var->data.location, var->data.centroid,
  86                                     var->data.sample);
  87       }
  88    }
  89 }
  90
  91 void
  92 fs_visitor::nir_setup_outputs()
  93 {
  94    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
  95
  96    nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_outputs);
  97
  98    nir_foreach_variable(var, &nir->outputs) {
  99       fs_reg reg = offset(nir_outputs, bld, var->data.driver_location);
 100
 101       int vector_elements = var->type->without_array()->vector_elements;
 102
 103       switch (stage) {
 104       case MESA_SHADER_VERTEX:
 105       case MESA_SHADER_GEOMETRY:
 106          for (int i = 0; i < type_size_vec4(var->type); i++) {
 107             int output = var->data.location + i;
 108             this->outputs[output] = offset(reg, bld, 4 * i);
 109             this->output_components[output] = vector_elements;
 110          }
 111          break;
 112       case MESA_SHADER_FRAGMENT:
 113          if (var->data.index > 0) {
 114             assert(var->data.location == FRAG_RESULT_DATA0);
 115             assert(var->data.index == 1);
 116             this->dual_src_output = reg;
 117             this->do_dual_src = true;
 118          } else if (var->data.location == FRAG_RESULT_COLOR) {
 119             /* Writing gl_FragColor outputs to all color regions. */
 120             for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
 121                this->outputs[i] = reg;
 122                this->output_components[i] = 4;
 123             }
 124          } else if (var->data.location == FRAG_RESULT_DEPTH) {
 125             this->frag_depth = reg;
 126          } else if (var->data.location == FRAG_RESULT_STENCIL) {
 127             this->frag_stencil = reg;
 128          } else if (var->data.location == FRAG_RESULT_SAMPLE_MASK) {
 129             this->sample_mask = reg;
 130          } else {
 131             /* gl_FragData or a user-defined FS output */
 132             assert(var->data.location >= FRAG_RESULT_DATA0 &&
 133                    var->data.location < FRAG_RESULT_DATA0+BRW_MAX_DRAW_BUFFERS);
 134
 135             /* General color output. */
 136             for (unsigned int i = 0; i < MAX2(1, var->type->length); i++) {
 137                int output = var->data.location - FRAG_RESULT_DATA0 + i;
 138                this->outputs[output] = offset(reg, bld, vector_elements * i);
 139                this->output_components[output] = vector_elements;
 140             }
 141          }
 142          break;
 143       default:
 144          unreachable("unhandled shader stage");
 145       }
 146    }
 147 }
 148
 149 void
 150 fs_visitor::nir_setup_uniforms()
 151 {
 152    if (dispatch_width != 8)
 153       return;
 154
 155    uniforms = nir->num_uniforms;
 156
 157    nir_foreach_variable(var, &nir->uniforms) {
 158       /* UBO's and atomics don't take up space in the uniform file */
 159       if (var->interface_type != NULL || var->type->contains_atomic())
 160          continue;
 161
 162       if (type_size_scalar(var->type) > 0)
 163          param_size[var->data.driver_location] = type_size_scalar(var->type);
 164    }
 165 }
 166
 167 static bool
 168 emit_system_values_block(nir_block *block, void *void_visitor)
 169 {
 170    fs_visitor *v = (fs_visitor *)void_visitor;
 171    fs_reg *reg;
 172
 173    nir_foreach_instr(block, instr) {
 174       if (instr->type != nir_instr_type_intrinsic)
 175          continue;
 176
 177       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 178       switch (intrin->intrinsic) {
 179       case nir_intrinsic_load_vertex_id:
 180          unreachable("should be lowered by lower_vertex_id().");
 181
 182       case nir_intrinsic_load_vertex_id_zero_base:
 183          assert(v->stage == MESA_SHADER_VERTEX);
 184          reg = &v->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
 185          if (reg->file == BAD_FILE)
 186             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
 187          break;
 188
 189       case nir_intrinsic_load_base_vertex:
 190          assert(v->stage == MESA_SHADER_VERTEX);
 191          reg = &v->nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
 192          if (reg->file == BAD_FILE)
 193             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_VERTEX);
 194          break;
 195
 196       case nir_intrinsic_load_instance_id:
 197          assert(v->stage == MESA_SHADER_VERTEX);
 198          reg = &v->nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
 199          if (reg->file == BAD_FILE)
 200             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
 201          break;
 202
 203       case nir_intrinsic_load_invocation_id:
 204          assert(v->stage == MESA_SHADER_GEOMETRY);
 205          reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
 206          if (reg->file == BAD_FILE) {
 207             const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
 208             fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 209             fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
 210             abld.SHR(iid, g1, brw_imm_ud(27u));
 211             *reg = iid;
 212          }
 213          break;
 214
 215       case nir_intrinsic_load_sample_pos:
 216          assert(v->stage == MESA_SHADER_FRAGMENT);
 217          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
 218          if (reg->file == BAD_FILE)
 219             *reg = *v->emit_samplepos_setup();
 220          break;
 221
 222       case nir_intrinsic_load_sample_id:
 223          assert(v->stage == MESA_SHADER_FRAGMENT);
 224          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
 225          if (reg->file == BAD_FILE)
 226             *reg = *v->emit_sampleid_setup();
 227          break;
 228
 229       case nir_intrinsic_load_sample_mask_in:
 230          assert(v->stage == MESA_SHADER_FRAGMENT);
 231          assert(v->devinfo->gen >= 7);
 232          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
 233          if (reg->file == BAD_FILE)
 234             *reg = fs_reg(retype(brw_vec8_grf(v->payload.sample_mask_in_reg, 0),
 235                                  BRW_REGISTER_TYPE_D));
 236          break;
 237
 238       case nir_intrinsic_load_local_invocation_id:
 239          assert(v->stage == MESA_SHADER_COMPUTE);
 240          reg = &v->nir_system_values[SYSTEM_VALUE_LOCAL_INVOCATION_ID];
 241          if (reg->file == BAD_FILE)
 242             *reg = *v->emit_cs_local_invocation_id_setup();
 243          break;
 244
 245       case nir_intrinsic_load_work_group_id:
 246          assert(v->stage == MESA_SHADER_COMPUTE);
 247          reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
 248          if (reg->file == BAD_FILE)
 249             *reg = *v->emit_cs_work_group_id_setup();
 250          break;
 251
 252       case nir_intrinsic_load_helper_invocation:
 253          assert(v->stage == MESA_SHADER_FRAGMENT);
 254          reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
 255          if (reg->file == BAD_FILE) {
 256             const fs_builder abld =
 257                v->bld.annotate("gl_HelperInvocation", NULL);
 258
 259             /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
 260              * pixel mask is in g1.7 of the thread payload.
 261              *
 262              * We move the per-channel pixel enable bit to the low bit of each
 263              * channel by shifting the byte containing the pixel mask by the
 264              * vector immediate 0x76543210UV.
 265              *
 266              * The region of <1,8,0> reads only 1 byte (the pixel masks for
 267              * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
 268              * masks for 2 and 3) in SIMD16.
 269              */
 270             fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
 271             abld.SHR(shifted,
 272                      stride(byte_offset(retype(brw_vec1_grf(1, 0),
 273                                                BRW_REGISTER_TYPE_UB), 28),
 274                             1, 8, 0),
 275                      brw_imm_uv(0x76543210));
 276
 277             /* A set bit in the pixel mask means the channel is enabled, but
 278              * that is the opposite of gl_HelperInvocation so we need to invert
 279              * the mask.
 280              *
 281              * The negate source-modifier bit of logical instructions on Gen8+
 282              * performs 1's complement negation, so we can use that instead of
 283              * a NOT instruction.
 284              */
 285             fs_reg inverted = negate(shifted);
 286             if (v->devinfo->gen < 8) {
 287                inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
 288                abld.NOT(inverted, shifted);
 289             }
 290
 291             /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
 292              * with 1 and negating.
 293              */
 294             fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
 295             abld.AND(anded, inverted, brw_imm_uw(1));
 296
 297             fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
 298             abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
 299             *reg = dst;
 300          }
 301          break;
 302
 303       default:
 304          break;
 305       }
 306    }
 307
 308    return true;
 309 }
 310
 311 void
 312 fs_visitor::nir_emit_system_values()
 313 {
 314    nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
 315    for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
 316       nir_system_values[i] = fs_reg();
 317    }
 318
 319    nir_foreach_overload(nir, overload) {
 320       assert(strcmp(overload->function->name, "main") == 0);
 321       assert(overload->impl);
 322       nir_foreach_block(overload->impl, emit_system_values_block, this);
 323    }
 324 }
 325
 326 void
 327 fs_visitor::nir_emit_impl(nir_function_impl *impl)
 328 {
 329    nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
 330    for (unsigned i = 0; i < impl->reg_alloc; i++) {
 331       nir_locals[i] = fs_reg();
 332    }
 333
 334    foreach_list_typed(nir_register, reg, node, &impl->registers) {
 335       unsigned array_elems =
 336          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
 337       unsigned size = array_elems * reg->num_components;
 338       nir_locals[reg->index] = bld.vgrf(BRW_REGISTER_TYPE_F, size);
 339    }
 340
 341    nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
 342                              impl->ssa_alloc);
 343
 344    nir_emit_cf_list(&impl->body);
 345 }
 346
 347 void
 348 fs_visitor::nir_emit_cf_list(exec_list *list)
 349 {
 350    exec_list_validate(list);
 351    foreach_list_typed(nir_cf_node, node, node, list) {
 352       switch (node->type) {
 353       case nir_cf_node_if:
 354          nir_emit_if(nir_cf_node_as_if(node));
 355          break;
 356
 357       case nir_cf_node_loop:
 358          nir_emit_loop(nir_cf_node_as_loop(node));
 359          break;
 360
 361       case nir_cf_node_block:
 362          nir_emit_block(nir_cf_node_as_block(node));
 363          break;
 364
 365       default:
 366          unreachable("Invalid CFG node block");
 367       }
 368    }
 369 }
 370
 371 void
 372 fs_visitor::nir_emit_if(nir_if *if_stmt)
 373 {
 374    /* first, put the condition into f0 */
 375    fs_inst *inst = bld.MOV(bld.null_reg_d(),
 376                             retype(get_nir_src(if_stmt->condition),
 377                                    BRW_REGISTER_TYPE_D));
 378    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 379
 380    bld.IF(BRW_PREDICATE_NORMAL);
 381
 382    nir_emit_cf_list(&if_stmt->then_list);
 383
 384    /* note: if the else is empty, dead CF elimination will remove it */
 385    bld.emit(BRW_OPCODE_ELSE);
 386
 387    nir_emit_cf_list(&if_stmt->else_list);
 388
 389    bld.emit(BRW_OPCODE_ENDIF);
 390 }
 391
 392 void
 393 fs_visitor::nir_emit_loop(nir_loop *loop)
 394 {
 395    bld.emit(BRW_OPCODE_DO);
 396
 397    nir_emit_cf_list(&loop->body);
 398
 399    bld.emit(BRW_OPCODE_WHILE);
 400 }
 401
 402 void
 403 fs_visitor::nir_emit_block(nir_block *block)
 404 {
 405    nir_foreach_instr(block, instr) {
 406       nir_emit_instr(instr);
 407    }
 408 }
 409
 410 void
 411 fs_visitor::nir_emit_instr(nir_instr *instr)
 412 {
 413    const fs_builder abld = bld.annotate(NULL, instr);
 414
 415    switch (instr->type) {
 416    case nir_instr_type_alu:
 417       nir_emit_alu(abld, nir_instr_as_alu(instr));
 418       break;
 419
 420    case nir_instr_type_intrinsic:
 421       switch (stage) {
 422       case MESA_SHADER_VERTEX:
 423          nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 424          break;
 425       case MESA_SHADER_GEOMETRY:
 426          nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 427          break;
 428       case MESA_SHADER_FRAGMENT:
 429          nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 430          break;
 431       case MESA_SHADER_COMPUTE:
 432          nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 433          break;
 434       default:
 435          unreachable("unsupported shader stage");
 436       }
 437       break;
 438
 439    case nir_instr_type_tex:
 440       nir_emit_texture(abld, nir_instr_as_tex(instr));
 441       break;
 442
 443    case nir_instr_type_load_const:
 444       nir_emit_load_const(abld, nir_instr_as_load_const(instr));
 445       break;
 446
 447    case nir_instr_type_ssa_undef:
 448       nir_emit_undef(abld, nir_instr_as_ssa_undef(instr));
 449       break;
 450
 451    case nir_instr_type_jump:
 452       nir_emit_jump(abld, nir_instr_as_jump(instr));
 453       break;
 454
 455    default:
 456       unreachable("unknown instruction type");
 457    }
 458 }
 459
 460 bool
 461 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
 462                                          const fs_reg &result)
 463 {
 464    if (!instr->src[0].src.is_ssa ||
 465        instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
 466       return false;
 467
 468    nir_intrinsic_instr *src0 =
 469       nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
 470
 471    if (src0->intrinsic != nir_intrinsic_load_front_face)
 472       return false;
 473
 474    nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
 475    if (!value1 || fabsf(value1->f[0]) != 1.0f)
 476       return false;
 477
 478    nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src);
 479    if (!value2 || fabsf(value2->f[0]) != 1.0f)
 480       return false;
 481
 482    fs_reg tmp = vgrf(glsl_type::int_type);
 483
 484    if (devinfo->gen >= 6) {
 485       /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
 486       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
 487
 488       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 489        *
 490        *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
 491        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
 492        *
 493        * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
 494        *
 495        * This negation looks like it's safe in practice, because bits 0:4 will
 496        * surely be TRIANGLES
 497        */
 498
 499       if (value1->f[0] == -1.0f) {
 500          g0.negate = true;
 501       }
 502
 503       tmp.type = BRW_REGISTER_TYPE_W;
 504       tmp.subreg_offset = 2;
 505       tmp.stride = 2;
 506
 507       bld.OR(tmp, g0, brw_imm_uw(0x3f80));
 508
 509       tmp.type = BRW_REGISTER_TYPE_D;
 510       tmp.subreg_offset = 0;
 511       tmp.stride = 1;
 512    } else {
 513       /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
 514       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
 515
 516       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 517        *
 518        *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
 519        *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
 520        *
 521        * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
 522        *
 523        * This negation looks like it's safe in practice, because bits 0:4 will
 524        * surely be TRIANGLES
 525        */
 526
 527       if (value1->f[0] == -1.0f) {
 528          g1_6.negate = true;
 529       }
 530
 531       bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
 532    }
 533    bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
 534
 535    return true;
 536 }
 537
 538 void
 539 fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
 540 {
 541    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
 542    fs_inst *inst;
 543
 544    fs_reg result = get_nir_dest(instr->dest.dest);
 545    result.type = brw_type_for_nir_type(nir_op_infos[instr->op].output_type);
 546
 547    fs_reg op[4];
 548    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 549       op[i] = get_nir_src(instr->src[i].src);
 550       op[i].type = brw_type_for_nir_type(nir_op_infos[instr->op].input_types[i]);
 551       op[i].abs = instr->src[i].abs;
 552       op[i].negate = instr->src[i].negate;
 553    }
 554
 555    /* We get a bunch of mov's out of the from_ssa pass and they may still
 556     * be vectorized.  We'll handle them as a special-case.  We'll also
 557     * handle vecN here because it's basically the same thing.
 558     */
 559    switch (instr->op) {
 560    case nir_op_imov:
 561    case nir_op_fmov:
 562    case nir_op_vec2:
 563    case nir_op_vec3:
 564    case nir_op_vec4: {
 565       fs_reg temp = result;
 566       bool need_extra_copy = false;
 567       for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 568          if (!instr->src[i].src.is_ssa &&
 569              instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
 570             need_extra_copy = true;
 571             temp = bld.vgrf(result.type, 4);
 572             break;
 573          }
 574       }
 575
 576       for (unsigned i = 0; i < 4; i++) {
 577          if (!(instr->dest.write_mask & (1 << i)))
 578             continue;
 579
 580          if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
 581             inst = bld.MOV(offset(temp, bld, i),
 582                            offset(op[0], bld, instr->src[0].swizzle[i]));
 583          } else {
 584             inst = bld.MOV(offset(temp, bld, i),
 585                            offset(op[i], bld, instr->src[i].swizzle[0]));
 586          }
 587          inst->saturate = instr->dest.saturate;
 588       }
 589
 590       /* In this case the source and destination registers were the same,
 591        * so we need to insert an extra set of moves in order to deal with
 592        * any swizzling.
 593        */
 594       if (need_extra_copy) {
 595          for (unsigned i = 0; i < 4; i++) {
 596             if (!(instr->dest.write_mask & (1 << i)))
 597                continue;
 598
 599             bld.MOV(offset(result, bld, i), offset(temp, bld, i));
 600          }
 601       }
 602       return;
 603    }
 604    default:
 605       break;
 606    }
 607
 608    /* At this point, we have dealt with any instruction that operates on
 609     * more than a single channel.  Therefore, we can just adjust the source
 610     * and destination registers for that channel and emit the instruction.
 611     */
 612    unsigned channel = 0;
 613    if (nir_op_infos[instr->op].output_size == 0) {
 614       /* Since NIR is doing the scalarizing for us, we should only ever see
 615        * vectorized operations with a single channel.
 616        */
 617       assert(_mesa_bitcount(instr->dest.write_mask) == 1);
 618       channel = ffs(instr->dest.write_mask) - 1;
 619
 620       result = offset(result, bld, channel);
 621    }
 622
 623    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 624       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
 625       op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
 626    }
 627
 628    switch (instr->op) {
 629    case nir_op_i2f:
 630    case nir_op_u2f:
 631       inst = bld.MOV(result, op[0]);
 632       inst->saturate = instr->dest.saturate;
 633       break;
 634
 635    case nir_op_f2i:
 636    case nir_op_f2u:
 637       bld.MOV(result, op[0]);
 638       break;
 639
 640    case nir_op_fsign: {
 641       /* AND(val, 0x80000000) gives the sign bit.
 642          *
 643          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
 644          * zero.
 645          */
 646       bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
 647
 648       fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
 649       op[0].type = BRW_REGISTER_TYPE_UD;
 650       result.type = BRW_REGISTER_TYPE_UD;
 651       bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
 652
 653       inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
 654       inst->predicate = BRW_PREDICATE_NORMAL;
 655       if (instr->dest.saturate) {
 656          inst = bld.MOV(result, result);
 657          inst->saturate = true;
 658       }
 659       break;
 660    }
 661
 662    case nir_op_isign:
 663       /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
 664        *               -> non-negative val generates 0x00000000.
 665        *  Predicated OR sets 1 if val is positive.
 666        */
 667       bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G);
 668       bld.ASR(result, op[0], brw_imm_d(31));
 669       inst = bld.OR(result, result, brw_imm_d(1));
 670       inst->predicate = BRW_PREDICATE_NORMAL;
 671       break;
 672
 673    case nir_op_frcp:
 674       inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
 675       inst->saturate = instr->dest.saturate;
 676       break;
 677
 678    case nir_op_fexp2:
 679       inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
 680       inst->saturate = instr->dest.saturate;
 681       break;
 682
 683    case nir_op_flog2:
 684       inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
 685       inst->saturate = instr->dest.saturate;
 686       break;
 687
 688    case nir_op_fsin:
 689       inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
 690       inst->saturate = instr->dest.saturate;
 691       break;
 692
 693    case nir_op_fcos:
 694       inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
 695       inst->saturate = instr->dest.saturate;
 696       break;
 697
 698    case nir_op_fddx:
 699       if (fs_key->high_quality_derivatives) {
 700          inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
 701       } else {
 702          inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
 703       }
 704       inst->saturate = instr->dest.saturate;
 705       break;
 706    case nir_op_fddx_fine:
 707       inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
 708       inst->saturate = instr->dest.saturate;
 709       break;
 710    case nir_op_fddx_coarse:
 711       inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
 712       inst->saturate = instr->dest.saturate;
 713       break;
 714    case nir_op_fddy:
 715       if (fs_key->high_quality_derivatives) {
 716          inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0],
 717                          brw_imm_d(fs_key->render_to_fbo));
 718       } else {
 719          inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0],
 720                          brw_imm_d(fs_key->render_to_fbo));
 721       }
 722       inst->saturate = instr->dest.saturate;
 723       break;
 724    case nir_op_fddy_fine:
 725       inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0],
 726                       brw_imm_d(fs_key->render_to_fbo));
 727       inst->saturate = instr->dest.saturate;
 728       break;
 729    case nir_op_fddy_coarse:
 730       inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0],
 731                       brw_imm_d(fs_key->render_to_fbo));
 732       inst->saturate = instr->dest.saturate;
 733       break;
 734
 735    case nir_op_fadd:
 736    case nir_op_iadd:
 737       inst = bld.ADD(result, op[0], op[1]);
 738       inst->saturate = instr->dest.saturate;
 739       break;
 740
 741    case nir_op_fmul:
 742       inst = bld.MUL(result, op[0], op[1]);
 743       inst->saturate = instr->dest.saturate;
 744       break;
 745
 746    case nir_op_imul:
 747       bld.MUL(result, op[0], op[1]);
 748       break;
 749
 750    case nir_op_imul_high:
 751    case nir_op_umul_high:
 752       bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
 753       break;
 754
 755    case nir_op_idiv:
 756    case nir_op_udiv:
 757       bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
 758       break;
 759
 760    case nir_op_uadd_carry:
 761       unreachable("Should have been lowered by carry_to_arith().");
 762
 763    case nir_op_usub_borrow:
 764       unreachable("Should have been lowered by borrow_to_arith().");
 765
 766    case nir_op_umod:
 767       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
 768       break;
 769
 770    case nir_op_flt:
 771    case nir_op_ilt:
 772    case nir_op_ult:
 773       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_L);
 774       break;
 775
 776    case nir_op_fge:
 777    case nir_op_ige:
 778    case nir_op_uge:
 779       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_GE);
 780       break;
 781
 782    case nir_op_feq:
 783    case nir_op_ieq:
 784       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_Z);
 785       break;
 786
 787    case nir_op_fne:
 788    case nir_op_ine:
 789       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_NZ);
 790       break;
 791
 792    case nir_op_inot:
 793       if (devinfo->gen >= 8) {
 794          op[0] = resolve_source_modifiers(op[0]);
 795       }
 796       bld.NOT(result, op[0]);
 797       break;
 798    case nir_op_ixor:
 799       if (devinfo->gen >= 8) {
 800          op[0] = resolve_source_modifiers(op[0]);
 801          op[1] = resolve_source_modifiers(op[1]);
 802       }
 803       bld.XOR(result, op[0], op[1]);
 804       break;
 805    case nir_op_ior:
 806       if (devinfo->gen >= 8) {
 807          op[0] = resolve_source_modifiers(op[0]);
 808          op[1] = resolve_source_modifiers(op[1]);
 809       }
 810       bld.OR(result, op[0], op[1]);
 811       break;
 812    case nir_op_iand:
 813       if (devinfo->gen >= 8) {
 814          op[0] = resolve_source_modifiers(op[0]);
 815          op[1] = resolve_source_modifiers(op[1]);
 816       }
 817       bld.AND(result, op[0], op[1]);
 818       break;
 819
 820    case nir_op_fdot2:
 821    case nir_op_fdot3:
 822    case nir_op_fdot4:
 823    case nir_op_bany2:
 824    case nir_op_bany3:
 825    case nir_op_bany4:
 826    case nir_op_ball2:
 827    case nir_op_ball3:
 828    case nir_op_ball4:
 829    case nir_op_ball_fequal2:
 830    case nir_op_ball_iequal2:
 831    case nir_op_ball_fequal3:
 832    case nir_op_ball_iequal3:
 833    case nir_op_ball_fequal4:
 834    case nir_op_ball_iequal4:
 835    case nir_op_bany_fnequal2:
 836    case nir_op_bany_inequal2:
 837    case nir_op_bany_fnequal3:
 838    case nir_op_bany_inequal3:
 839    case nir_op_bany_fnequal4:
 840    case nir_op_bany_inequal4:
 841       unreachable("Lowered by nir_lower_alu_reductions");
 842
 843    case nir_op_fnoise1_1:
 844    case nir_op_fnoise1_2:
 845    case nir_op_fnoise1_3:
 846    case nir_op_fnoise1_4:
 847    case nir_op_fnoise2_1:
 848    case nir_op_fnoise2_2:
 849    case nir_op_fnoise2_3:
 850    case nir_op_fnoise2_4:
 851    case nir_op_fnoise3_1:
 852    case nir_op_fnoise3_2:
 853    case nir_op_fnoise3_3:
 854    case nir_op_fnoise3_4:
 855    case nir_op_fnoise4_1:
 856    case nir_op_fnoise4_2:
 857    case nir_op_fnoise4_3:
 858    case nir_op_fnoise4_4:
 859       unreachable("not reached: should be handled by lower_noise");
 860
 861    case nir_op_ldexp:
 862       unreachable("not reached: should be handled by ldexp_to_arith()");
 863
 864    case nir_op_fsqrt:
 865       inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
 866       inst->saturate = instr->dest.saturate;
 867       break;
 868
 869    case nir_op_frsq:
 870       inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
 871       inst->saturate = instr->dest.saturate;
 872       break;
 873
 874    case nir_op_b2i:
 875    case nir_op_b2f:
 876       bld.MOV(result, negate(op[0]));
 877       break;
 878
 879    case nir_op_f2b:
 880       bld.CMP(result, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
 881       break;
 882    case nir_op_i2b:
 883       bld.CMP(result, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
 884       break;
 885
 886    case nir_op_ftrunc:
 887       inst = bld.RNDZ(result, op[0]);
 888       inst->saturate = instr->dest.saturate;
 889       break;
 890
 891    case nir_op_fceil: {
 892       op[0].negate = !op[0].negate;
 893       fs_reg temp = vgrf(glsl_type::float_type);
 894       bld.RNDD(temp, op[0]);
 895       temp.negate = true;
 896       inst = bld.MOV(result, temp);
 897       inst->saturate = instr->dest.saturate;
 898       break;
 899    }
 900    case nir_op_ffloor:
 901       inst = bld.RNDD(result, op[0]);
 902       inst->saturate = instr->dest.saturate;
 903       break;
 904    case nir_op_ffract:
 905       inst = bld.FRC(result, op[0]);
 906       inst->saturate = instr->dest.saturate;
 907       break;
 908    case nir_op_fround_even:
 909       inst = bld.RNDE(result, op[0]);
 910       inst->saturate = instr->dest.saturate;
 911       break;
 912
 913    case nir_op_fmin:
 914    case nir_op_imin:
 915    case nir_op_umin:
 916       if (devinfo->gen >= 6) {
 917          inst = bld.emit(BRW_OPCODE_SEL, result, op[0], op[1]);
 918          inst->conditional_mod = BRW_CONDITIONAL_L;
 919       } else {
 920          bld.CMP(bld.null_reg_d(), op[0], op[1], BRW_CONDITIONAL_L);
 921          inst = bld.SEL(result, op[0], op[1]);
 922          inst->predicate = BRW_PREDICATE_NORMAL;
 923       }
 924       inst->saturate = instr->dest.saturate;
 925       break;
 926
 927    case nir_op_fmax:
 928    case nir_op_imax:
 929    case nir_op_umax:
 930       if (devinfo->gen >= 6) {
 931          inst = bld.emit(BRW_OPCODE_SEL, result, op[0], op[1]);
 932          inst->conditional_mod = BRW_CONDITIONAL_GE;
 933       } else {
 934          bld.CMP(bld.null_reg_d(), op[0], op[1], BRW_CONDITIONAL_GE);
 935          inst = bld.SEL(result, op[0], op[1]);
 936          inst->predicate = BRW_PREDICATE_NORMAL;
 937       }
 938       inst->saturate = instr->dest.saturate;
 939       break;
 940
 941    case nir_op_pack_snorm_2x16:
 942    case nir_op_pack_snorm_4x8:
 943    case nir_op_pack_unorm_2x16:
 944    case nir_op_pack_unorm_4x8:
 945    case nir_op_unpack_snorm_2x16:
 946    case nir_op_unpack_snorm_4x8:
 947    case nir_op_unpack_unorm_2x16:
 948    case nir_op_unpack_unorm_4x8:
 949    case nir_op_unpack_half_2x16:
 950    case nir_op_pack_half_2x16:
 951       unreachable("not reached: should be handled by lower_packing_builtins");
 952
 953    case nir_op_unpack_half_2x16_split_x:
 954       inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
 955       inst->saturate = instr->dest.saturate;
 956       break;
 957    case nir_op_unpack_half_2x16_split_y:
 958       inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
 959       inst->saturate = instr->dest.saturate;
 960       break;
 961
 962    case nir_op_fpow:
 963       inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
 964       inst->saturate = instr->dest.saturate;
 965       break;
 966
 967    case nir_op_bitfield_reverse:
 968       bld.BFREV(result, op[0]);
 969       break;
 970
 971    case nir_op_bit_count:
 972       bld.CBIT(result, op[0]);
 973       break;
 974
 975    case nir_op_ufind_msb:
 976    case nir_op_ifind_msb: {
 977       bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
 978
 979       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
 980        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
 981        * subtract the result from 31 to convert the MSB count into an LSB count.
 982        */
 983       bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
 984
 985       inst = bld.ADD(result, result, brw_imm_d(31));
 986       inst->predicate = BRW_PREDICATE_NORMAL;
 987       inst->src[0].negate = true;
 988       break;
 989    }
 990
 991    case nir_op_find_lsb:
 992       bld.FBL(result, op[0]);
 993       break;
 994
 995    case nir_op_ubitfield_extract:
 996    case nir_op_ibitfield_extract:
 997       bld.BFE(result, op[2], op[1], op[0]);
 998       break;
 999    case nir_op_bfm:
1000       bld.BFI1(result, op[0], op[1]);
1001       break;
1002    case nir_op_bfi:
1003       bld.BFI2(result, op[0], op[1], op[2]);
1004       break;
1005
1006    case nir_op_bitfield_insert:
1007       unreachable("not reached: should be handled by "
1008                   "lower_instructions::bitfield_insert_to_bfm_bfi");
1009
1010    case nir_op_ishl:
1011       bld.SHL(result, op[0], op[1]);
1012       break;
1013    case nir_op_ishr:
1014       bld.ASR(result, op[0], op[1]);
1015       break;
1016    case nir_op_ushr:
1017       bld.SHR(result, op[0], op[1]);
1018       break;
1019
1020    case nir_op_pack_half_2x16_split:
1021       bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1022       break;
1023
1024    case nir_op_ffma:
1025       inst = bld.MAD(result, op[2], op[1], op[0]);
1026       inst->saturate = instr->dest.saturate;
1027       break;
1028
1029    case nir_op_flrp:
1030       inst = bld.LRP(result, op[0], op[1], op[2]);
1031       inst->saturate = instr->dest.saturate;
1032       break;
1033
1034    case nir_op_bcsel:
1035       if (optimize_frontfacing_ternary(instr, result))
1036          return;
1037
1038       bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1039       inst = bld.SEL(result, op[1], op[2]);
1040       inst->predicate = BRW_PREDICATE_NORMAL;
1041       break;
1042
1043    default:
1044       unreachable("unhandled instruction");
1045    }
1046
1047    /* If we need to do a boolean resolve, replace the result with -(x & 1)
1048     * to sign extend the low bit to 0/~0
1049     */
1050    if (devinfo->gen <= 5 &&
1051        (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
1052       fs_reg masked = vgrf(glsl_type::int_type);
1053       bld.AND(masked, result, brw_imm_d(1));
1054       masked.negate = true;
1055       bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
1056    }
1057 }
1058
1059 void
1060 fs_visitor::nir_emit_load_const(const fs_builder &bld,
1061                                 nir_load_const_instr *instr)
1062 {
1063    fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_D, instr->def.num_components);
1064
1065    for (unsigned i = 0; i < instr->def.num_components; i++)
1066       bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i[i]));
1067
1068    nir_ssa_values[instr->def.index] = reg;
1069 }
1070
1071 void
1072 fs_visitor::nir_emit_undef(const fs_builder &bld, nir_ssa_undef_instr *instr)
1073 {
1074    nir_ssa_values[instr->def.index] = bld.vgrf(BRW_REGISTER_TYPE_D,
1075                                                instr->def.num_components);
1076 }
1077
1078 static fs_reg
1079 fs_reg_for_nir_reg(fs_visitor *v, nir_register *nir_reg,
1080                    unsigned base_offset, nir_src *indirect)
1081 {
1082    fs_reg reg;
1083
1084    assert(!nir_reg->is_global);
1085
1086    reg = v->nir_locals[nir_reg->index];
1087
1088    reg = offset(reg, v->bld, base_offset * nir_reg->num_components);
1089    if (indirect) {
1090       int multiplier = nir_reg->num_components * (v->dispatch_width / 8);
1091
1092       reg.reladdr = new(v->mem_ctx) fs_reg(v->vgrf(glsl_type::int_type));
1093       v->bld.MUL(*reg.reladdr, v->get_nir_src(*indirect),
1094                  brw_imm_d(multiplier));
1095    }
1096
1097    return reg;
1098 }
1099
1100 fs_reg
1101 fs_visitor::get_nir_src(nir_src src)
1102 {
1103    fs_reg reg;
1104    if (src.is_ssa) {
1105       reg = nir_ssa_values[src.ssa->index];
1106    } else {
1107       reg = fs_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset,
1108                                src.reg.indirect);
1109    }
1110
1111    /* to avoid floating-point denorm flushing problems, set the type by
1112     * default to D - instructions that need floating point semantics will set
1113     * this to F if they need to
1114     */
1115    return retype(reg, BRW_REGISTER_TYPE_D);
1116 }
1117
1118 fs_reg
1119 fs_visitor::get_nir_dest(nir_dest dest)
1120 {
1121    if (dest.is_ssa) {
1122       nir_ssa_values[dest.ssa.index] = bld.vgrf(BRW_REGISTER_TYPE_F,
1123                                                 dest.ssa.num_components);
1124       return nir_ssa_values[dest.ssa.index];
1125    }
1126
1127    return fs_reg_for_nir_reg(this, dest.reg.reg, dest.reg.base_offset,
1128                              dest.reg.indirect);
1129 }
1130
1131 fs_reg
1132 fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
1133 {
1134    fs_reg image(UNIFORM, deref->var->data.driver_location,
1135                 BRW_REGISTER_TYPE_UD);
1136
1137    for (const nir_deref *tail = &deref->deref; tail->child;
1138         tail = tail->child) {
1139       const nir_deref_array *deref_array = nir_deref_as_array(tail->child);
1140       assert(tail->child->deref_type == nir_deref_type_array);
1141       const unsigned size = glsl_get_length(tail->type);
1142       const unsigned element_size = type_size_scalar(deref_array->deref.type);
1143       const unsigned base = MIN2(deref_array->base_offset, size - 1);
1144       image = offset(image, bld, base * element_size);
1145
1146       if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
1147          fs_reg tmp = vgrf(glsl_type::int_type);
1148
1149          if (devinfo->gen == 7 && !devinfo->is_haswell) {
1150             /* IVB hangs when trying to access an invalid surface index with
1151              * the dataport.  According to the spec "if the index used to
1152              * select an individual element is negative or greater than or
1153              * equal to the size of the array, the results of the operation
1154              * are undefined but may not lead to termination" -- which is one
1155              * of the possible outcomes of the hang.  Clamp the index to
1156              * prevent access outside of the array bounds.
1157              */
1158             bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect),
1159                                         BRW_REGISTER_TYPE_UD),
1160                             brw_imm_ud(size - base - 1), BRW_CONDITIONAL_L);
1161          } else {
1162             bld.MOV(tmp, get_nir_src(deref_array->indirect));
1163          }
1164
1165          bld.MUL(tmp, tmp, brw_imm_ud(element_size));
1166          if (image.reladdr)
1167             bld.ADD(*image.reladdr, *image.reladdr, tmp);
1168          else
1169             image.reladdr = new(mem_ctx) fs_reg(tmp);
1170       }
1171    }
1172
1173    return image;
1174 }
1175
1176 void
1177 fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
1178                          unsigned wr_mask)
1179 {
1180    for (unsigned i = 0; i < 4; i++) {
1181       if (!((wr_mask >> i) & 1))
1182          continue;
1183
1184       fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
1185       new_inst->dst = offset(new_inst->dst, bld, i);
1186       for (unsigned j = 0; j < new_inst->sources; j++)
1187          if (new_inst->src[j].file == VGRF)
1188             new_inst->src[j] = offset(new_inst->src[j], bld, i);
1189
1190       bld.emit(new_inst);
1191    }
1192 }
1193
1194 /**
1195  * Get the matching channel register datatype for an image intrinsic of the
1196  * specified GLSL image type.
1197  */
1198 static brw_reg_type
1199 get_image_base_type(const glsl_type *type)
1200 {
1201    switch ((glsl_base_type)type->sampler_type) {
1202    case GLSL_TYPE_UINT:
1203       return BRW_REGISTER_TYPE_UD;
1204    case GLSL_TYPE_INT:
1205       return BRW_REGISTER_TYPE_D;
1206    case GLSL_TYPE_FLOAT:
1207       return BRW_REGISTER_TYPE_F;
1208    default:
1209       unreachable("Not reached.");
1210    }
1211 }
1212
1213 /**
1214  * Get the appropriate atomic op for an image atomic intrinsic.
1215  */
1216 static unsigned
1217 get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type)
1218 {
1219    switch (op) {
1220    case nir_intrinsic_image_atomic_add:
1221       return BRW_AOP_ADD;
1222    case nir_intrinsic_image_atomic_min:
1223       return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
1224               BRW_AOP_IMIN : BRW_AOP_UMIN);
1225    case nir_intrinsic_image_atomic_max:
1226       return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
1227               BRW_AOP_IMAX : BRW_AOP_UMAX);
1228    case nir_intrinsic_image_atomic_and:
1229       return BRW_AOP_AND;
1230    case nir_intrinsic_image_atomic_or:
1231       return BRW_AOP_OR;
1232    case nir_intrinsic_image_atomic_xor:
1233       return BRW_AOP_XOR;
1234    case nir_intrinsic_image_atomic_exchange:
1235       return BRW_AOP_MOV;
1236    case nir_intrinsic_image_atomic_comp_swap:
1237       return BRW_AOP_CMPWR;
1238    default:
1239       unreachable("Not reachable.");
1240    }
1241 }
1242
1243 static fs_inst *
1244 emit_pixel_interpolater_send(const fs_builder &bld,
1245                              enum opcode opcode,
1246                              const fs_reg &dst,
1247                              const fs_reg &src,
1248                              const fs_reg &desc,
1249                              glsl_interp_qualifier interpolation)
1250 {
1251    fs_inst *inst;
1252    fs_reg payload;
1253    int mlen;
1254
1255    if (src.file == BAD_FILE) {
1256       /* Dummy payload */
1257       payload = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
1258       mlen = 1;
1259    } else {
1260       payload = src;
1261       mlen = 2 * bld.dispatch_width() / 8;
1262    }
1263
1264    inst = bld.emit(opcode, dst, payload, desc);
1265    inst->mlen = mlen;
1266    /* 2 floats per slot returned */
1267    inst->regs_written = 2 * bld.dispatch_width() / 8;
1268    inst->pi_noperspective = interpolation == INTERP_QUALIFIER_NOPERSPECTIVE;
1269
1270    return inst;
1271 }
1272
1273 /**
1274  * Computes 1 << x, given a D/UD register containing some value x.
1275  */
1276 static fs_reg
1277 intexp2(const fs_builder &bld, const fs_reg &x)
1278 {
1279    assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
1280
1281    fs_reg result = bld.vgrf(x.type, 1);
1282    fs_reg one = bld.vgrf(x.type, 1);
1283
1284    bld.MOV(one, retype(brw_imm_d(1), one.type));
1285    bld.SHL(result, one, x);
1286    return result;
1287 }
1288
1289 void
1290 fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
1291 {
1292    assert(stage == MESA_SHADER_GEOMETRY);
1293
1294    struct brw_gs_prog_data *gs_prog_data =
1295       (struct brw_gs_prog_data *) prog_data;
1296
1297    /* We can only do EndPrimitive() functionality when the control data
1298     * consists of cut bits.  Fortunately, the only time it isn't is when the
1299     * output type is points, in which case EndPrimitive() is a no-op.
1300     */
1301    if (gs_prog_data->control_data_format !=
1302        GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
1303       return;
1304    }
1305
1306    /* Cut bits use one bit per vertex. */
1307    assert(gs_compile->control_data_bits_per_vertex == 1);
1308
1309    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1310    vertex_count.type = BRW_REGISTER_TYPE_UD;
1311
1312    /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
1313     * vertex n, 0 otherwise.  So all we need to do here is mark bit
1314     * (vertex_count - 1) % 32 in the cut_bits register to indicate that
1315     * EndPrimitive() was called after emitting vertex (vertex_count - 1);
1316     * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
1317     *
1318     * Note that if EndPrimitive() is called before emitting any vertices, this
1319     * will cause us to set bit 31 of the control_data_bits register to 1.
1320     * That's fine because:
1321     *
1322     * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
1323     *   output, so the hardware will ignore cut bit 31.
1324     *
1325     * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
1326     *   last vertex, so setting cut bit 31 has no effect (since the primitive
1327     *   is automatically ended when the GS terminates).
1328     *
1329     * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
1330     *   control_data_bits register to 0 when the first vertex is emitted.
1331     */
1332
1333    const fs_builder abld = bld.annotate("end primitive");
1334
1335    /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
1336    fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1337    abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1338    fs_reg mask = intexp2(abld, prev_count);
1339    /* Note: we're relying on the fact that the GEN SHL instruction only pays
1340     * attention to the lower 5 bits of its second source argument, so on this
1341     * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
1342     * ((vertex_count - 1) % 32).
1343     */
1344    abld.OR(this->control_data_bits, this->control_data_bits, mask);
1345 }
1346
1347 void
1348 fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
1349 {
1350    assert(stage == MESA_SHADER_GEOMETRY);
1351    assert(gs_compile->control_data_bits_per_vertex != 0);
1352
1353    struct brw_gs_prog_data *gs_prog_data =
1354       (struct brw_gs_prog_data *) prog_data;
1355
1356    const fs_builder abld = bld.annotate("emit control data bits");
1357    const fs_builder fwa_bld = bld.exec_all();
1358
1359    /* We use a single UD register to accumulate control data bits (32 bits
1360     * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
1361     * at a time.
1362     *
1363     * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
1364     * We have select a 128-bit group via the Global and Per-Slot Offsets, then
1365     * use the Channel Mask phase to enable/disable which DWord within that
1366     * group to write.  (Remember, different SIMD8 channels may have emitted
1367     * different numbers of vertices, so we may need per-slot offsets.)
1368     *
1369     * Channel masking presents an annoying problem: we may have to replicate
1370     * the data up to 4 times:
1371     *
1372     * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
1373     *
1374     * To avoid penalizing shaders that emit a small number of vertices, we
1375     * can avoid these sometimes: if the size of the control data header is
1376     * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
1377     * land in the same 128-bit group, so we can skip per-slot offsets.
1378     *
1379     * Similarly, if the control data header is <= 32 bits, there is only one
1380     * DWord, so we can skip channel masks.
1381     */
1382    enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
1383
1384    fs_reg channel_mask, per_slot_offset;
1385
1386    if (gs_compile->control_data_header_size_bits > 32) {
1387       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
1388       channel_mask = vgrf(glsl_type::uint_type);
1389    }
1390
1391    if (gs_compile->control_data_header_size_bits > 128) {
1392       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
1393       per_slot_offset = vgrf(glsl_type::uint_type);
1394    }
1395
1396    /* Figure out which DWord we're trying to write to using the formula:
1397     *
1398     *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
1399     *
1400     * Since bits_per_vertex is a power of two, and is known at compile
1401     * time, this can be optimized to:
1402     *
1403     *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
1404     */
1405    if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
1406       fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1407       fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1408       abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1409       unsigned log2_bits_per_vertex =
1410          _mesa_fls(gs_compile->control_data_bits_per_vertex);
1411       abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
1412
1413       if (per_slot_offset.file != BAD_FILE) {
1414          /* Set the per-slot offset to dword_index / 4, so that we'll write to
1415           * the appropriate OWord within the control data header.
1416           */
1417          abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
1418       }
1419
1420       /* Set the channel masks to 1 << (dword_index % 4), so that we'll
1421        * write to the appropriate DWORD within the OWORD.
1422        */
1423       fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1424       fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
1425       channel_mask = intexp2(fwa_bld, channel);
1426       /* Then the channel masks need to be in bits 23:16. */
1427       fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
1428    }
1429
1430    /* Store the control data bits in the message payload and send it. */
1431    int mlen = 2;
1432    if (channel_mask.file != BAD_FILE)
1433       mlen += 4; /* channel masks, plus 3 extra copies of the data */
1434    if (per_slot_offset.file != BAD_FILE)
1435       mlen++;
1436
1437    fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
1438    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
1439    int i = 0;
1440    sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1441    if (per_slot_offset.file != BAD_FILE)
1442       sources[i++] = per_slot_offset;
1443    if (channel_mask.file != BAD_FILE)
1444       sources[i++] = channel_mask;
1445    while (i < mlen) {
1446       sources[i++] = this->control_data_bits;
1447    }
1448
1449    abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
1450    fs_inst *inst = abld.emit(opcode, reg_undef, payload);
1451    inst->mlen = mlen;
1452    /* We need to increment Global Offset by 256-bits to make room for
1453     * Broadwell's extra "Vertex Count" payload at the beginning of the
1454     * URB entry.  Since this is an OWord message, Global Offset is counted
1455     * in 128-bit units, so we must set it to 2.
1456     */
1457    if (gs_prog_data->static_vertex_count == -1)
1458       inst->offset = 2;
1459 }
1460
1461 void
1462 fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
1463                                             unsigned stream_id)
1464 {
1465    /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
1466
1467    /* Note: we are calling this *before* increasing vertex_count, so
1468     * this->vertex_count == vertex_count - 1 in the formula above.
1469     */
1470
1471    /* Stream mode uses 2 bits per vertex */
1472    assert(gs_compile->control_data_bits_per_vertex == 2);
1473
1474    /* Must be a valid stream */
1475    assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
1476
1477    /* Control data bits are initialized to 0 so we don't have to set any
1478     * bits when sending vertices to stream 0.
1479     */
1480    if (stream_id == 0)
1481       return;
1482
1483    const fs_builder abld = bld.annotate("set stream control data bits", NULL);
1484
1485    /* reg::sid = stream_id */
1486    fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1487    abld.MOV(sid, brw_imm_ud(stream_id));
1488
1489    /* reg:shift_count = 2 * (vertex_count - 1) */
1490    fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1491    abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
1492
1493    /* Note: we're relying on the fact that the GEN SHL instruction only pays
1494     * attention to the lower 5 bits of its second source argument, so on this
1495     * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
1496     * stream_id << ((2 * (vertex_count - 1)) % 32).
1497     */
1498    fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1499    abld.SHL(mask, sid, shift_count);
1500    abld.OR(this->control_data_bits, this->control_data_bits, mask);
1501 }
1502
1503 void
1504 fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
1505                            unsigned stream_id)
1506 {
1507    assert(stage == MESA_SHADER_GEOMETRY);
1508
1509    struct brw_gs_prog_data *gs_prog_data =
1510       (struct brw_gs_prog_data *) prog_data;
1511
1512    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1513    vertex_count.type = BRW_REGISTER_TYPE_UD;
1514
1515    /* Haswell and later hardware ignores the "Render Stream Select" bits
1516     * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
1517     * and instead sends all primitives down the pipeline for rasterization.
1518     * If the SOL stage is enabled, "Render Stream Select" is honored and
1519     * primitives bound to non-zero streams are discarded after stream output.
1520     *
1521     * Since the only purpose of primives sent to non-zero streams is to
1522     * be recorded by transform feedback, we can simply discard all geometry
1523     * bound to these streams when transform feedback is disabled.
1524     */
1525    if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
1526       return;
1527
1528    /* If we're outputting 32 control data bits or less, then we can wait
1529     * until the shader is over to output them all.  Otherwise we need to
1530     * output them as we go.  Now is the time to do it, since we're about to
1531     * output the vertex_count'th vertex, so it's guaranteed that the
1532     * control data bits associated with the (vertex_count - 1)th vertex are
1533     * correct.
1534     */
1535    if (gs_compile->control_data_header_size_bits > 32) {
1536       const fs_builder abld =
1537          bld.annotate("emit vertex: emit control data bits");
1538
1539       /* Only emit control data bits if we've finished accumulating a batch
1540        * of 32 bits.  This is the case when:
1541        *
1542        *     (vertex_count * bits_per_vertex) % 32 == 0
1543        *
1544        * (in other words, when the last 5 bits of vertex_count *
1545        * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
1546        * integer n (which is always the case, since bits_per_vertex is
1547        * always 1 or 2), this is equivalent to requiring that the last 5-n
1548        * bits of vertex_count are 0:
1549        *
1550        *     vertex_count & (2^(5-n) - 1) == 0
1551        *
1552        * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
1553        * equivalent to:
1554        *
1555        *     vertex_count & (32 / bits_per_vertex - 1) == 0
1556        *
1557        * TODO: If vertex_count is an immediate, we could do some of this math
1558        *       at compile time...
1559        */
1560       fs_inst *inst =
1561          abld.AND(bld.null_reg_d(), vertex_count,
1562                   brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
1563       inst->conditional_mod = BRW_CONDITIONAL_Z;
1564
1565       abld.IF(BRW_PREDICATE_NORMAL);
1566       /* If vertex_count is 0, then no control data bits have been
1567        * accumulated yet, so we can skip emitting them.
1568        */
1569       abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
1570                BRW_CONDITIONAL_NEQ);
1571       abld.IF(BRW_PREDICATE_NORMAL);
1572       emit_gs_control_data_bits(vertex_count);
1573       abld.emit(BRW_OPCODE_ENDIF);
1574
1575       /* Reset control_data_bits to 0 so we can start accumulating a new
1576        * batch.
1577        *
1578        * Note: in the case where vertex_count == 0, this neutralizes the
1579        * effect of any call to EndPrimitive() that the shader may have
1580        * made before outputting its first vertex.
1581        */
1582       inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
1583       inst->force_writemask_all = true;
1584       abld.emit(BRW_OPCODE_ENDIF);
1585    }
1586
1587    emit_urb_writes(vertex_count);
1588
1589    /* In stream mode we have to set control data bits for all vertices
1590     * unless we have disabled control data bits completely (which we do
1591     * do for GL_POINTS outputs that don't use streams).
1592     */
1593    if (gs_compile->control_data_header_size_bits > 0 &&
1594        gs_prog_data->control_data_format ==
1595           GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
1596       set_gs_stream_control_data_bits(vertex_count, stream_id);
1597    }
1598 }
1599
1600 void
1601 fs_visitor::emit_gs_input_load(const fs_reg &dst,
1602                                const nir_src &vertex_src,
1603                                const fs_reg &indirect_offset,
1604                                unsigned imm_offset,
1605                                unsigned num_components)
1606 {
1607    struct brw_gs_prog_data *gs_prog_data = (struct brw_gs_prog_data *) prog_data;
1608
1609    /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y],
1610     * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].  Only
1611     * gl_PointSize is available as a GS input, however, so it must be that.
1612     */
1613    const bool is_point_size =
1614       indirect_offset.file == BAD_FILE && imm_offset == 0;
1615
1616    nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
1617    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
1618
1619    if (indirect_offset.file == BAD_FILE && vertex_const != NULL &&
1620        4 * imm_offset < push_reg_count) {
1621       imm_offset = 4 * imm_offset + vertex_const->u[0] * push_reg_count;
1622       /* This input was pushed into registers. */
1623       if (is_point_size) {
1624          /* gl_PointSize comes in .w */
1625          bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type));
1626       } else {
1627          for (unsigned i = 0; i < num_components; i++) {
1628             bld.MOV(offset(dst, bld, i),
1629                     fs_reg(ATTR, imm_offset + i, dst.type));
1630          }
1631       }
1632    } else {
1633       /* Resort to the pull model.  Ensure the VUE handles are provided. */
1634       gs_prog_data->base.include_vue_handles = true;
1635
1636       unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
1637       fs_reg icp_handle;
1638
1639       if (vertex_const) {
1640          /* The vertex index is constant; just select the proper URB handle. */
1641          icp_handle =
1642             retype(brw_vec8_grf(first_icp_handle + vertex_const->i[0], 0),
1643                    BRW_REGISTER_TYPE_UD);
1644       } else {
1645          /* The vertex index is non-constant.  We need to use indirect
1646           * addressing to fetch the proper URB handle.
1647           *
1648           * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
1649           * indicating that channel <n> should read the handle from
1650           * DWord <n>.  We convert that to bytes by multiplying by 4.
1651           *
1652           * Next, we convert the vertex index to bytes by multiplying
1653           * by 32 (shifting by 5), and add the two together.  This is
1654           * the final indirect byte offset.
1655           */
1656          fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1);
1657          fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1658          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1659          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1660          icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1661
1662          /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
1663          bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
1664          /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
1665          bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
1666          /* Convert vertex_index to bytes (multiply by 32) */
1667          bld.SHL(vertex_offset_bytes,
1668                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
1669                  brw_imm_ud(5u));
1670          bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
1671
1672          /* Use first_icp_handle as the base offset.  There is one register
1673           * of URB handles per vertex, so inform the register allocator that
1674           * we might read up to nir->info.gs.vertices_in registers.
1675           */
1676          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
1677                   fs_reg(brw_vec8_grf(first_icp_handle, 0)),
1678                   fs_reg(icp_offset_bytes),
1679                   brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));
1680       }
1681
1682       fs_inst *inst;
1683       if (indirect_offset.file == BAD_FILE) {
1684          /* Constant indexing - use global offset. */
1685          inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
1686          inst->offset = imm_offset;
1687          inst->base_mrf = -1;
1688          inst->mlen = 1;
1689          inst->regs_written = num_components;
1690       } else {
1691          /* Indirect indexing - use per-slot offsets as well. */
1692          const fs_reg srcs[] = { icp_handle, indirect_offset };
1693          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
1694          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
1695
1696          inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
1697          inst->offset = imm_offset;
1698          inst->base_mrf = -1;
1699          inst->mlen = 2;
1700          inst->regs_written = num_components;
1701       }
1702
1703       if (is_point_size) {
1704          /* Read the whole VUE header (because of alignment) and read .w. */
1705          fs_reg tmp = bld.vgrf(dst.type, 4);
1706          inst->dst = tmp;
1707          inst->regs_written = 4;
1708          bld.MOV(dst, offset(tmp, bld, 3));
1709       }
1710    }
1711 }
1712
1713 void
1714 fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
1715                                   nir_intrinsic_instr *instr)
1716 {
1717    assert(stage == MESA_SHADER_VERTEX);
1718
1719    fs_reg dest;
1720    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
1721       dest = get_nir_dest(instr->dest);
1722
1723    switch (instr->intrinsic) {
1724    case nir_intrinsic_load_vertex_id:
1725       unreachable("should be lowered by lower_vertex_id()");
1726
1727    case nir_intrinsic_load_vertex_id_zero_base:
1728    case nir_intrinsic_load_base_vertex:
1729    case nir_intrinsic_load_instance_id: {
1730       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
1731       fs_reg val = nir_system_values[sv];
1732       assert(val.file != BAD_FILE);
1733       dest.type = val.type;
1734       bld.MOV(dest, val);
1735       break;
1736    }
1737
1738    default:
1739       nir_emit_intrinsic(bld, instr);
1740       break;
1741    }
1742 }
1743
1744 void
1745 fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
1746                                   nir_intrinsic_instr *instr)
1747 {
1748    assert(stage == MESA_SHADER_GEOMETRY);
1749    fs_reg indirect_offset;
1750
1751    fs_reg dest;
1752    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
1753       dest = get_nir_dest(instr->dest);
1754
1755    switch (instr->intrinsic) {
1756    case nir_intrinsic_load_primitive_id:
1757       assert(stage == MESA_SHADER_GEOMETRY);
1758       assert(((struct brw_gs_prog_data *)prog_data)->include_primitive_id);
1759       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
1760               retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
1761       break;
1762
1763    case nir_intrinsic_load_input_indirect:
1764    case nir_intrinsic_load_input:
1765       unreachable("load_input intrinsics are invalid for the GS stage");
1766
1767    case nir_intrinsic_load_per_vertex_input_indirect:
1768       indirect_offset = retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_D);
1769       /* fallthrough */
1770    case nir_intrinsic_load_per_vertex_input:
1771       emit_gs_input_load(dest, instr->src[0],
1772                          indirect_offset, instr->const_index[0],
1773                          instr->num_components);
1774       break;
1775
1776    case nir_intrinsic_emit_vertex_with_counter:
1777       emit_gs_vertex(instr->src[0], instr->const_index[0]);
1778       break;
1779
1780    case nir_intrinsic_end_primitive_with_counter:
1781       emit_gs_end_primitive(instr->src[0]);
1782       break;
1783
1784    case nir_intrinsic_set_vertex_count:
1785       bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
1786       break;
1787
1788    case nir_intrinsic_load_invocation_id: {
1789       fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
1790       assert(val.file != BAD_FILE);
1791       dest.type = val.type;
1792       bld.MOV(dest, val);
1793       break;
1794    }
1795
1796    default:
1797       nir_emit_intrinsic(bld, instr);
1798       break;
1799    }
1800 }
1801
1802 void
1803 fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
1804                                   nir_intrinsic_instr *instr)
1805 {
1806    assert(stage == MESA_SHADER_FRAGMENT);
1807    struct brw_wm_prog_data *wm_prog_data =
1808       (struct brw_wm_prog_data *) prog_data;
1809
1810    fs_reg dest;
1811    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
1812       dest = get_nir_dest(instr->dest);
1813
1814    switch (instr->intrinsic) {
1815    case nir_intrinsic_load_front_face:
1816       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
1817               *emit_frontfacing_interpolation());
1818       break;
1819
1820    case nir_intrinsic_load_sample_pos: {
1821       fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
1822       assert(sample_pos.file != BAD_FILE);
1823       dest.type = sample_pos.type;
1824       bld.MOV(dest, sample_pos);
1825       bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
1826       break;
1827    }
1828
1829    case nir_intrinsic_load_helper_invocation:
1830    case nir_intrinsic_load_sample_mask_in:
1831    case nir_intrinsic_load_sample_id: {
1832       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
1833       fs_reg val = nir_system_values[sv];
1834       assert(val.file != BAD_FILE);
1835       dest.type = val.type;
1836       bld.MOV(dest, val);
1837       break;
1838    }
1839
1840    case nir_intrinsic_discard:
1841    case nir_intrinsic_discard_if: {
1842       /* We track our discarded pixels in f0.1.  By predicating on it, we can
1843        * update just the flag bits that aren't yet discarded.  If there's no
1844        * condition, we emit a CMP of g0 != g0, so all currently executing
1845        * channels will get turned off.
1846        */
1847       fs_inst *cmp;
1848       if (instr->intrinsic == nir_intrinsic_discard_if) {
1849          cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
1850                        brw_imm_d(0), BRW_CONDITIONAL_Z);
1851       } else {
1852          fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
1853                                        BRW_REGISTER_TYPE_UW));
1854          cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
1855       }
1856       cmp->predicate = BRW_PREDICATE_NORMAL;
1857       cmp->flag_subreg = 1;
1858
1859       if (devinfo->gen >= 6) {
1860          emit_discard_jump();
1861       }
1862       break;
1863    }
1864
1865    case nir_intrinsic_interp_var_at_centroid:
1866    case nir_intrinsic_interp_var_at_sample:
1867    case nir_intrinsic_interp_var_at_offset: {
1868       /* Handle ARB_gpu_shader5 interpolation intrinsics
1869        *
1870        * It's worth a quick word of explanation as to why we handle the full
1871        * variable-based interpolation intrinsic rather than a lowered version
1872        * with like we do for other inputs.  We have to do that because the way
1873        * we set up inputs doesn't allow us to use the already setup inputs for
1874        * interpolation.  At the beginning of the shader, we go through all of
1875        * the input variables and do the initial interpolation and put it in
1876        * the nir_inputs array based on its location as determined in
1877        * nir_lower_io.  If the input isn't used, dead code cleans up and
1878        * everything works fine.  However, when we get to the ARB_gpu_shader5
1879        * interpolation intrinsics, we need to reinterpolate the input
1880        * differently.  If we used an intrinsic that just had an index it would
1881        * only give us the offset into the nir_inputs array.  However, this is
1882        * useless because that value is post-interpolation and we need
1883        * pre-interpolation.  In order to get the actual location of the bits
1884        * we get from the vertex fetching hardware, we need the variable.
1885        */
1886       wm_prog_data->pulls_bary = true;
1887
1888       fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
1889       const glsl_interp_qualifier interpolation =
1890          (glsl_interp_qualifier) instr->variables[0]->var->data.interpolation;
1891
1892       switch (instr->intrinsic) {
1893       case nir_intrinsic_interp_var_at_centroid:
1894          emit_pixel_interpolater_send(bld,
1895                                       FS_OPCODE_INTERPOLATE_AT_CENTROID,
1896                                       dst_xy,
1897                                       fs_reg(), /* src */
1898                                       brw_imm_ud(0u),
1899                                       interpolation);
1900          break;
1901
1902       case nir_intrinsic_interp_var_at_sample: {
1903          nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
1904
1905          if (const_sample) {
1906             unsigned msg_data = const_sample->i[0] << 4;
1907
1908             emit_pixel_interpolater_send(bld,
1909                                          FS_OPCODE_INTERPOLATE_AT_SAMPLE,
1910                                          dst_xy,
1911                                          fs_reg(), /* src */
1912                                          brw_imm_ud(msg_data),
1913                                          interpolation);
1914          } else {
1915             const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
1916                                              BRW_REGISTER_TYPE_UD);
1917
1918             if (nir_src_is_dynamically_uniform(instr->src[0])) {
1919                const fs_reg sample_id = bld.emit_uniformize(sample_src);
1920                const fs_reg msg_data = vgrf(glsl_type::uint_type);
1921                bld.exec_all().group(1, 0)
1922                   .SHL(msg_data, sample_id, brw_imm_ud(4u));
1923                emit_pixel_interpolater_send(bld,
1924                                             FS_OPCODE_INTERPOLATE_AT_SAMPLE,
1925                                             dst_xy,
1926                                             fs_reg(), /* src */
1927                                             msg_data,
1928                                             interpolation);
1929             } else {
1930                /* Make a loop that sends a message to the pixel interpolater
1931                 * for the sample number in each live channel. If there are
1932                 * multiple channels with the same sample number then these
1933                 * will be handled simultaneously with a single interation of
1934                 * the loop.
1935                 */
1936                bld.emit(BRW_OPCODE_DO);
1937
1938                /* Get the next live sample number into sample_id_reg */
1939                const fs_reg sample_id = bld.emit_uniformize(sample_src);
1940
1941                /* Set the flag register so that we can perform the send
1942                 * message on all channels that have the same sample number
1943                 */
1944                bld.CMP(bld.null_reg_ud(),
1945                        sample_src, sample_id,
1946                        BRW_CONDITIONAL_EQ);
1947                const fs_reg msg_data = vgrf(glsl_type::uint_type);
1948                bld.exec_all().group(1, 0)
1949                   .SHL(msg_data, sample_id, brw_imm_ud(4u));
1950                fs_inst *inst =
1951                   emit_pixel_interpolater_send(bld,
1952                                                FS_OPCODE_INTERPOLATE_AT_SAMPLE,
1953                                                dst_xy,
1954                                                fs_reg(), /* src */
1955                                                msg_data,
1956                                                interpolation);
1957                set_predicate(BRW_PREDICATE_NORMAL, inst);
1958
1959                /* Continue the loop if there are any live channels left */
1960                set_predicate_inv(BRW_PREDICATE_NORMAL,
1961                                  true, /* inverse */
1962                                  bld.emit(BRW_OPCODE_WHILE));
1963             }
1964          }
1965
1966          break;
1967       }
1968
1969       case nir_intrinsic_interp_var_at_offset: {
1970          nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
1971
1972          if (const_offset) {
1973             unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
1974             unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;
1975
1976             emit_pixel_interpolater_send(bld,
1977                                          FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
1978                                          dst_xy,
1979                                          fs_reg(), /* src */
1980                                          brw_imm_ud(off_x | (off_y << 4)),
1981                                          interpolation);
1982          } else {
1983             fs_reg src = vgrf(glsl_type::ivec2_type);
1984             fs_reg offset_src = retype(get_nir_src(instr->src[0]),
1985                                        BRW_REGISTER_TYPE_F);
1986             for (int i = 0; i < 2; i++) {
1987                fs_reg temp = vgrf(glsl_type::float_type);
1988                bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
1989                fs_reg itemp = vgrf(glsl_type::int_type);
1990                bld.MOV(itemp, temp);  /* float to int */
1991
1992                /* Clamp the upper end of the range to +7/16.
1993                 * ARB_gpu_shader5 requires that we support a maximum offset
1994                 * of +0.5, which isn't representable in a S0.4 value -- if
1995                 * we didn't clamp it, we'd end up with -8/16, which is the
1996                 * opposite of what the shader author wanted.
1997                 *
1998                 * This is legal due to ARB_gpu_shader5's quantization
1999                 * rules:
2000                 *
2001                 * "Not all values of <offset> may be supported; x and y
2002                 * offsets may be rounded to fixed-point values with the
2003                 * number of fraction bits given by the
2004                 * implementation-dependent constant
2005                 * FRAGMENT_INTERPOLATION_OFFSET_BITS"
2006                 */
2007                set_condmod(BRW_CONDITIONAL_L,
2008                            bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
2009             }
2010
2011             const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
2012             emit_pixel_interpolater_send(bld,
2013                                          opcode,
2014                                          dst_xy,
2015                                          src,
2016                                          brw_imm_ud(0u),
2017                                          interpolation);
2018          }
2019          break;
2020       }
2021
2022       default:
2023          unreachable("Invalid intrinsic");
2024       }
2025
2026       for (unsigned j = 0; j < instr->num_components; j++) {
2027          fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
2028          src.type = dest.type;
2029
2030          bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
2031          dest = offset(dest, bld, 1);
2032       }
2033       break;
2034    }
2035    default:
2036       nir_emit_intrinsic(bld, instr);
2037       break;
2038    }
2039 }
2040
2041 void
2042 fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
2043                                   nir_intrinsic_instr *instr)
2044 {
2045    assert(stage == MESA_SHADER_COMPUTE);
2046    struct brw_cs_prog_data *cs_prog_data =
2047       (struct brw_cs_prog_data *) prog_data;
2048
2049    fs_reg dest;
2050    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2051       dest = get_nir_dest(instr->dest);
2052
2053    switch (instr->intrinsic) {
2054    case nir_intrinsic_barrier:
2055       emit_barrier();
2056       cs_prog_data->uses_barrier = true;
2057       break;
2058
2059    case nir_intrinsic_load_local_invocation_id:
2060    case nir_intrinsic_load_work_group_id: {
2061       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
2062       fs_reg val = nir_system_values[sv];
2063       assert(val.file != BAD_FILE);
2064       dest.type = val.type;
2065       for (unsigned i = 0; i < 3; i++)
2066          bld.MOV(offset(dest, bld, i), offset(val, bld, i));
2067       break;
2068    }
2069
2070    case nir_intrinsic_load_num_work_groups: {
2071       const unsigned surface =
2072          cs_prog_data->binding_table.work_groups_start;
2073
2074       cs_prog_data->uses_num_work_groups = true;
2075
2076       fs_reg surf_index = brw_imm_ud(surface);
2077       brw_mark_surface_used(prog_data, surface);
2078
2079       /* Read the 3 GLuint components of gl_NumWorkGroups */
2080       for (unsigned i = 0; i < 3; i++) {
2081          fs_reg read_result =
2082             emit_untyped_read(bld, surf_index,
2083                               brw_imm_ud(i << 2),
2084                               1 /* dims */, 1 /* size */,
2085                               BRW_PREDICATE_NONE);
2086          read_result.type = dest.type;
2087          bld.MOV(dest, read_result);
2088          dest = offset(dest, bld, 1);
2089       }
2090       break;
2091    }
2092
2093    default:
2094       nir_emit_intrinsic(bld, instr);
2095       break;
2096    }
2097 }
2098
2099 void
2100 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
2101 {
2102    fs_reg dest;
2103    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2104       dest = get_nir_dest(instr->dest);
2105
2106    bool has_indirect = false;
2107
2108    switch (instr->intrinsic) {
2109    case nir_intrinsic_atomic_counter_inc:
2110    case nir_intrinsic_atomic_counter_dec:
2111    case nir_intrinsic_atomic_counter_read: {
2112       using namespace surface_access;
2113
2114       /* Get the arguments of the atomic intrinsic. */
2115       const fs_reg offset = get_nir_src(instr->src[0]);
2116       const unsigned surface = (stage_prog_data->binding_table.abo_start +
2117                                 instr->const_index[0]);
2118       fs_reg tmp;
2119
2120       /* Emit a surface read or atomic op. */
2121       switch (instr->intrinsic) {
2122       case nir_intrinsic_atomic_counter_read:
2123          tmp = emit_untyped_read(bld, brw_imm_ud(surface), offset, 1, 1);
2124          break;
2125
2126       case nir_intrinsic_atomic_counter_inc:
2127          tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, fs_reg(),
2128                                    fs_reg(), 1, 1, BRW_AOP_INC);
2129          break;
2130
2131       case nir_intrinsic_atomic_counter_dec:
2132          tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, fs_reg(),
2133                                    fs_reg(), 1, 1, BRW_AOP_PREDEC);
2134          break;
2135
2136       default:
2137          unreachable("Unreachable");
2138       }
2139
2140       /* Assign the result. */
2141       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp);
2142
2143       /* Mark the surface as used. */
2144       brw_mark_surface_used(stage_prog_data, surface);
2145       break;
2146    }
2147
2148    case nir_intrinsic_image_load:
2149    case nir_intrinsic_image_store:
2150    case nir_intrinsic_image_atomic_add:
2151    case nir_intrinsic_image_atomic_min:
2152    case nir_intrinsic_image_atomic_max:
2153    case nir_intrinsic_image_atomic_and:
2154    case nir_intrinsic_image_atomic_or:
2155    case nir_intrinsic_image_atomic_xor:
2156    case nir_intrinsic_image_atomic_exchange:
2157    case nir_intrinsic_image_atomic_comp_swap: {
2158       using namespace image_access;
2159
2160       /* Get the referenced image variable and type. */
2161       const nir_variable *var = instr->variables[0]->var;
2162       const glsl_type *type = var->type->without_array();
2163       const brw_reg_type base_type = get_image_base_type(type);
2164
2165       /* Get some metadata from the image intrinsic. */
2166       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
2167       const unsigned arr_dims = type->sampler_array ? 1 : 0;
2168       const unsigned surf_dims = type->coordinate_components() - arr_dims;
2169       const mesa_format format =
2170          (var->data.image.write_only ? MESA_FORMAT_NONE :
2171           _mesa_get_shader_image_format(var->data.image.format));
2172
2173       /* Get the arguments of the image intrinsic. */
2174       const fs_reg image = get_nir_image_deref(instr->variables[0]);
2175       const fs_reg addr = retype(get_nir_src(instr->src[0]),
2176                                  BRW_REGISTER_TYPE_UD);
2177       const fs_reg src0 = (info->num_srcs >= 3 ?
2178                            retype(get_nir_src(instr->src[2]), base_type) :
2179                            fs_reg());
2180       const fs_reg src1 = (info->num_srcs >= 4 ?
2181                            retype(get_nir_src(instr->src[3]), base_type) :
2182                            fs_reg());
2183       fs_reg tmp;
2184
2185       /* Emit an image load, store or atomic op. */
2186       if (instr->intrinsic == nir_intrinsic_image_load)
2187          tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
2188
2189       else if (instr->intrinsic == nir_intrinsic_image_store)
2190          emit_image_store(bld, image, addr, src0, surf_dims, arr_dims, format);
2191
2192       else
2193          tmp = emit_image_atomic(bld, image, addr, src0, src1,
2194                                  surf_dims, arr_dims, info->dest_components,
2195                                  get_image_atomic_op(instr->intrinsic, type));
2196
2197       /* Assign the result. */
2198       for (unsigned c = 0; c < info->dest_components; ++c)
2199          bld.MOV(offset(retype(dest, base_type), bld, c),
2200                  offset(tmp, bld, c));
2201       break;
2202    }
2203
2204    case nir_intrinsic_memory_barrier_atomic_counter:
2205    case nir_intrinsic_memory_barrier_buffer:
2206    case nir_intrinsic_memory_barrier_image:
2207    case nir_intrinsic_memory_barrier: {
2208       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 16 / dispatch_width);
2209       bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
2210          ->regs_written = 2;
2211       break;
2212    }
2213
2214    case nir_intrinsic_group_memory_barrier:
2215    case nir_intrinsic_memory_barrier_shared:
2216       /* We treat these workgroup-level barriers as no-ops.  This should be
2217        * safe at present and as long as:
2218        *
2219        *  - Memory access instructions are not subsequently reordered by the
2220        *    compiler back-end.
2221        *
2222        *  - All threads from a given compute shader workgroup fit within a
2223        *    single subslice and therefore talk to the same HDC shared unit
2224        *    which supposedly guarantees ordering and coherency between threads
2225        *    from the same workgroup.  This may change in the future when we
2226        *    start splitting workgroups across multiple subslices.
2227        *
2228        *  - The context is not in fault-and-stream mode, which could cause
2229        *    memory transactions (including to SLM) prior to the barrier to be
2230        *    replayed after the barrier if a pagefault occurs.  This shouldn't
2231        *    be a problem up to and including SKL because fault-and-stream is
2232        *    not usable due to hardware issues, but that's likely to change in
2233        *    the future.
2234        */
2235       break;
2236
2237    case nir_intrinsic_shader_clock: {
2238       /* We cannot do anything if there is an event, so ignore it for now */
2239       fs_reg shader_clock = get_timestamp(bld);
2240       const fs_reg srcs[] = { shader_clock.set_smear(0), shader_clock.set_smear(1) };
2241
2242       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
2243       break;
2244    }
2245
2246    case nir_intrinsic_image_size: {
2247       /* Get the referenced image variable and type. */
2248       const nir_variable *var = instr->variables[0]->var;
2249       const glsl_type *type = var->type->without_array();
2250
2251       /* Get the size of the image. */
2252       const fs_reg image = get_nir_image_deref(instr->variables[0]);
2253       const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
2254
2255       /* For 1DArray image types, the array index is stored in the Z component.
2256        * Fix this by swizzling the Z component to the Y component.
2257        */
2258       const bool is_1d_array_image =
2259                   type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
2260                   type->sampler_array;
2261
2262       /* For CubeArray images, we should count the number of cubes instead
2263        * of the number of faces. Fix it by dividing the (Z component) by 6.
2264        */
2265       const bool is_cube_array_image =
2266                   type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2267                   type->sampler_array;
2268
2269       /* Copy all the components. */
2270       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
2271       for (unsigned c = 0; c < info->dest_components; ++c) {
2272          if ((int)c >= type->coordinate_components()) {
2273              bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
2274                      brw_imm_d(1));
2275          } else if (c == 1 && is_1d_array_image) {
2276             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
2277                     offset(size, bld, 2));
2278          } else if (c == 2 && is_cube_array_image) {
2279             bld.emit(SHADER_OPCODE_INT_QUOTIENT,
2280                      offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
2281                      offset(size, bld, c), brw_imm_d(6));
2282          } else {
2283             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
2284                     offset(size, bld, c));
2285          }
2286        }
2287
2288       break;
2289    }
2290
2291    case nir_intrinsic_image_samples:
2292       /* The driver does not support multi-sampled images. */
2293       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
2294       break;
2295
2296    case nir_intrinsic_load_uniform_indirect:
2297       has_indirect = true;
2298       /* fallthrough */
2299    case nir_intrinsic_load_uniform: {
2300       fs_reg uniform_reg(UNIFORM, instr->const_index[0]);
2301       uniform_reg.reg_offset = instr->const_index[1];
2302
2303       for (unsigned j = 0; j < instr->num_components; j++) {
2304          fs_reg src = offset(retype(uniform_reg, dest.type), bld, j);
2305          if (has_indirect)
2306             src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
2307
2308          bld.MOV(dest, src);
2309          dest = offset(dest, bld, 1);
2310       }
2311       break;
2312    }
2313
2314    case nir_intrinsic_load_ubo_indirect:
2315       has_indirect = true;
2316       /* fallthrough */
2317    case nir_intrinsic_load_ubo: {
2318       nir_const_value *const_index = nir_src_as_const_value(instr->src[0]);
2319       fs_reg surf_index;
2320
2321       if (const_index) {
2322          const unsigned index = stage_prog_data->binding_table.ubo_start +
2323                                 const_index->u[0];
2324          surf_index = brw_imm_ud(index);
2325          brw_mark_surface_used(prog_data, index);
2326       } else {
2327          /* The block index is not a constant. Evaluate the index expression
2328           * per-channel and add the base UBO index; we have to select a value
2329           * from any live channel.
2330           */
2331          surf_index = vgrf(glsl_type::uint_type);
2332          bld.ADD(surf_index, get_nir_src(instr->src[0]),
2333                  brw_imm_ud(stage_prog_data->binding_table.ubo_start));
2334          surf_index = bld.emit_uniformize(surf_index);
2335
2336          /* Assume this may touch any UBO. It would be nice to provide
2337           * a tighter bound, but the array information is already lowered away.
2338           */
2339          brw_mark_surface_used(prog_data,
2340                                stage_prog_data->binding_table.ubo_start +
2341                                nir->info.num_ubos - 1);
2342       }
2343
2344       if (has_indirect) {
2345          /* Turn the byte offset into a dword offset. */
2346          fs_reg base_offset = vgrf(glsl_type::int_type);
2347          bld.SHR(base_offset, retype(get_nir_src(instr->src[1]),
2348                                      BRW_REGISTER_TYPE_D),
2349                  brw_imm_d(2));
2350
2351          unsigned vec4_offset = instr->const_index[0] / 4;
2352          for (int i = 0; i < instr->num_components; i++)
2353             VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
2354                                        base_offset, vec4_offset + i);
2355       } else {
2356          fs_reg packed_consts = vgrf(glsl_type::float_type);
2357          packed_consts.type = dest.type;
2358
2359          struct brw_reg const_offset_reg = brw_imm_ud(instr->const_index[0] & ~15);
2360          bld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
2361                   surf_index, const_offset_reg);
2362
2363          for (unsigned i = 0; i < instr->num_components; i++) {
2364             packed_consts.set_smear(instr->const_index[0] % 16 / 4 + i);
2365
2366             /* The std140 packing rules don't allow vectors to cross 16-byte
2367              * boundaries, and a reg is 32 bytes.
2368              */
2369             assert(packed_consts.subreg_offset < 32);
2370
2371             bld.MOV(dest, packed_consts);
2372             dest = offset(dest, bld, 1);
2373          }
2374       }
2375       break;
2376    }
2377
2378    case nir_intrinsic_load_ssbo_indirect:
2379       has_indirect = true;
2380       /* fallthrough */
2381    case nir_intrinsic_load_ssbo: {
2382       assert(devinfo->gen >= 7);
2383
2384       nir_const_value *const_uniform_block =
2385          nir_src_as_const_value(instr->src[0]);
2386
2387       fs_reg surf_index;
2388       if (const_uniform_block) {
2389          unsigned index = stage_prog_data->binding_table.ssbo_start +
2390                           const_uniform_block->u[0];
2391          surf_index = brw_imm_ud(index);
2392          brw_mark_surface_used(prog_data, index);
2393       } else {
2394          surf_index = vgrf(glsl_type::uint_type);
2395          bld.ADD(surf_index, get_nir_src(instr->src[0]),
2396                  brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
2397
2398          /* Assume this may touch any UBO. It would be nice to provide
2399           * a tighter bound, but the array information is already lowered away.
2400           */
2401          brw_mark_surface_used(prog_data,
2402                                stage_prog_data->binding_table.ssbo_start +
2403                                nir->info.num_ssbos - 1);
2404       }
2405
2406       /* Get the offset to read from */
2407       fs_reg offset_reg;
2408       if (has_indirect) {
2409          offset_reg = get_nir_src(instr->src[1]);
2410       } else {
2411          offset_reg = brw_imm_ud(instr->const_index[0]);
2412       }
2413
2414       /* Read the vector */
2415       fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
2416                                              1 /* dims */,
2417                                              instr->num_components,
2418                                              BRW_PREDICATE_NONE);
2419       read_result.type = dest.type;
2420       for (int i = 0; i < instr->num_components; i++)
2421          bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
2422
2423       break;
2424    }
2425
2426    case nir_intrinsic_load_input_indirect:
2427       has_indirect = true;
2428       /* fallthrough */
2429    case nir_intrinsic_load_input: {
2430       unsigned index = 0;
2431       for (unsigned j = 0; j < instr->num_components; j++) {
2432          fs_reg src;
2433          if (stage == MESA_SHADER_VERTEX) {
2434             src = offset(fs_reg(ATTR, instr->const_index[0], dest.type), bld, index);
2435          } else {
2436             src = offset(retype(nir_inputs, dest.type), bld,
2437                          instr->const_index[0] + index);
2438          }
2439          if (has_indirect)
2440             src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
2441          index++;
2442
2443          bld.MOV(dest, src);
2444          dest = offset(dest, bld, 1);
2445       }
2446       break;
2447    }
2448
2449    case nir_intrinsic_store_ssbo_indirect:
2450       has_indirect = true;
2451       /* fallthrough */
2452    case nir_intrinsic_store_ssbo: {
2453       assert(devinfo->gen >= 7);
2454
2455       /* Block index */
2456       fs_reg surf_index;
2457       nir_const_value *const_uniform_block =
2458          nir_src_as_const_value(instr->src[1]);
2459       if (const_uniform_block) {
2460          unsigned index = stage_prog_data->binding_table.ssbo_start +
2461                           const_uniform_block->u[0];
2462          surf_index = brw_imm_ud(index);
2463          brw_mark_surface_used(prog_data, index);
2464       } else {
2465          surf_index = vgrf(glsl_type::uint_type);
2466          bld.ADD(surf_index, get_nir_src(instr->src[1]),
2467                   brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
2468
2469          brw_mark_surface_used(prog_data,
2470                                stage_prog_data->binding_table.ssbo_start +
2471                                nir->info.num_ssbos - 1);
2472       }
2473
2474       /* Value */
2475       fs_reg val_reg = get_nir_src(instr->src[0]);
2476
2477       /* Writemask */
2478       unsigned writemask = instr->const_index[1];
2479
2480       /* Combine groups of consecutive enabled channels in one write
2481        * message. We use ffs to find the first enabled channel and then ffs on
2482        * the bit-inverse, down-shifted writemask to determine the length of
2483        * the block of enabled bits.
2484        */
2485       while (writemask) {
2486          unsigned first_component = ffs(writemask) - 1;
2487          unsigned length = ffs(~(writemask >> first_component)) - 1;
2488          fs_reg offset_reg;
2489
2490          if (!has_indirect) {
2491             offset_reg = brw_imm_ud(instr->const_index[0] + 4 * first_component);
2492          } else {
2493             offset_reg = vgrf(glsl_type::uint_type);
2494             bld.ADD(offset_reg,
2495                     retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD),
2496                     brw_imm_ud(4 * first_component));
2497          }
2498
2499          emit_untyped_write(bld, surf_index, offset_reg,
2500                             offset(val_reg, bld, first_component),
2501                             1 /* dims */, length,
2502                             BRW_PREDICATE_NONE);
2503
2504          /* Clear the bits in the writemask that we just wrote, then try
2505           * again to see if more channels are left.
2506           */
2507          writemask &= (15 << (first_component + length));
2508       }
2509       break;
2510    }
2511
2512    case nir_intrinsic_store_output_indirect:
2513       has_indirect = true;
2514       /* fallthrough */
2515    case nir_intrinsic_store_output: {
2516       fs_reg src = get_nir_src(instr->src[0]);
2517       unsigned index = 0;
2518       for (unsigned j = 0; j < instr->num_components; j++) {
2519          fs_reg new_dest = offset(retype(nir_outputs, src.type), bld,
2520                                   instr->const_index[0] + index);
2521          if (has_indirect)
2522             src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[1]));
2523          index++;
2524          bld.MOV(new_dest, src);
2525          src = offset(src, bld, 1);
2526       }
2527       break;
2528    }
2529
2530    case nir_intrinsic_ssbo_atomic_add:
2531       nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr);
2532       break;
2533    case nir_intrinsic_ssbo_atomic_imin:
2534       nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
2535       break;
2536    case nir_intrinsic_ssbo_atomic_umin:
2537       nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
2538       break;
2539    case nir_intrinsic_ssbo_atomic_imax:
2540       nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
2541       break;
2542    case nir_intrinsic_ssbo_atomic_umax:
2543       nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
2544       break;
2545    case nir_intrinsic_ssbo_atomic_and:
2546       nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
2547       break;
2548    case nir_intrinsic_ssbo_atomic_or:
2549       nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
2550       break;
2551    case nir_intrinsic_ssbo_atomic_xor:
2552       nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
2553       break;
2554    case nir_intrinsic_ssbo_atomic_exchange:
2555       nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
2556       break;
2557    case nir_intrinsic_ssbo_atomic_comp_swap:
2558       nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
2559       break;
2560
2561    case nir_intrinsic_get_buffer_size: {
2562       nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
2563       unsigned ssbo_index = const_uniform_block ? const_uniform_block->u[0] : 0;
2564       int reg_width = dispatch_width / 8;
2565
2566       /* Set LOD = 0 */
2567       fs_reg source = brw_imm_d(0);
2568
2569       int mlen = 1 * reg_width;
2570
2571       /* A resinfo's sampler message is used to get the buffer size.
2572        * The SIMD8's writeback message consists of four registers and
2573        * SIMD16's writeback message consists of 8 destination registers
2574        * (two per each component), although we are only interested on the
2575        * first component, where resinfo returns the buffer size for
2576        * SURFTYPE_BUFFER.
2577        */
2578       int regs_written = 4 * mlen;
2579       fs_reg src_payload = fs_reg(VGRF, alloc.allocate(mlen),
2580                                   BRW_REGISTER_TYPE_UD);
2581       bld.LOAD_PAYLOAD(src_payload, &source, 1, 0);
2582       fs_reg buffer_size = fs_reg(VGRF, alloc.allocate(regs_written),
2583                                   BRW_REGISTER_TYPE_UD);
2584       const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
2585       fs_inst *inst = bld.emit(FS_OPCODE_GET_BUFFER_SIZE, buffer_size,
2586                                src_payload, brw_imm_ud(index));
2587       inst->header_size = 0;
2588       inst->mlen = mlen;
2589       inst->regs_written = regs_written;
2590       bld.emit(inst);
2591       bld.MOV(retype(dest, buffer_size.type), buffer_size);
2592
2593       brw_mark_surface_used(prog_data, index);
2594       break;
2595    }
2596
2597    default:
2598       unreachable("unknown intrinsic");
2599    }
2600 }
2601
2602 void
2603 fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
2604                                  int op, nir_intrinsic_instr *instr)
2605 {
2606    fs_reg dest;
2607    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2608       dest = get_nir_dest(instr->dest);
2609
2610    fs_reg surface;
2611    nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
2612    if (const_surface) {
2613       unsigned surf_index = stage_prog_data->binding_table.ssbo_start +
2614                             const_surface->u[0];
2615       surface = brw_imm_ud(surf_index);
2616       brw_mark_surface_used(prog_data, surf_index);
2617    } else {
2618       surface = vgrf(glsl_type::uint_type);
2619       bld.ADD(surface, get_nir_src(instr->src[0]),
2620               brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
2621
2622       /* Assume this may touch any SSBO. This is the same we do for other
2623        * UBO/SSBO accesses with non-constant surface.
2624        */
2625       brw_mark_surface_used(prog_data,
2626                             stage_prog_data->binding_table.ssbo_start +
2627                             nir->info.num_ssbos - 1);
2628    }
2629
2630    fs_reg offset = get_nir_src(instr->src[1]);
2631    fs_reg data1 = get_nir_src(instr->src[2]);
2632    fs_reg data2;
2633    if (op == BRW_AOP_CMPWR)
2634       data2 = get_nir_src(instr->src[3]);
2635
2636    /* Emit the actual atomic operation operation */
2637
2638    fs_reg atomic_result =
2639       surface_access::emit_untyped_atomic(bld, surface, offset,
2640                                           data1, data2,
2641                                           1 /* dims */, 1 /* rsize */,
2642                                           op,
2643                                           BRW_PREDICATE_NONE);
2644    dest.type = atomic_result.type;
2645    bld.MOV(dest, atomic_result);
2646 }
2647
2648 void
2649 fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
2650 {
2651    unsigned sampler = instr->sampler_index;
2652    fs_reg sampler_reg(brw_imm_ud(sampler));
2653
2654    int gather_component = instr->component;
2655
2656    bool is_cube_array = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
2657                         instr->is_array;
2658
2659    int lod_components = 0;
2660    int UNUSED offset_components = 0;
2661
2662    fs_reg coordinate, shadow_comparitor, lod, lod2, sample_index, mcs, tex_offset;
2663
2664    for (unsigned i = 0; i < instr->num_srcs; i++) {
2665       fs_reg src = get_nir_src(instr->src[i].src);
2666       switch (instr->src[i].src_type) {
2667       case nir_tex_src_bias:
2668          lod = retype(src, BRW_REGISTER_TYPE_F);
2669          break;
2670       case nir_tex_src_comparitor:
2671          shadow_comparitor = retype(src, BRW_REGISTER_TYPE_F);
2672          break;
2673       case nir_tex_src_coord:
2674          switch (instr->op) {
2675          case nir_texop_txf:
2676          case nir_texop_txf_ms:
2677          case nir_texop_samples_identical:
2678             coordinate = retype(src, BRW_REGISTER_TYPE_D);
2679             break;
2680          default:
2681             coordinate = retype(src, BRW_REGISTER_TYPE_F);
2682             break;
2683          }
2684          break;
2685       case nir_tex_src_ddx:
2686          lod = retype(src, BRW_REGISTER_TYPE_F);
2687          lod_components = nir_tex_instr_src_size(instr, i);
2688          break;
2689       case nir_tex_src_ddy:
2690          lod2 = retype(src, BRW_REGISTER_TYPE_F);
2691          break;
2692       case nir_tex_src_lod:
2693          switch (instr->op) {
2694          case nir_texop_txs:
2695             lod = retype(src, BRW_REGISTER_TYPE_UD);
2696             break;
2697          case nir_texop_txf:
2698             lod = retype(src, BRW_REGISTER_TYPE_D);
2699             break;
2700          default:
2701             lod = retype(src, BRW_REGISTER_TYPE_F);
2702             break;
2703          }
2704          break;
2705       case nir_tex_src_ms_index:
2706          sample_index = retype(src, BRW_REGISTER_TYPE_UD);
2707          break;
2708       case nir_tex_src_offset:
2709          tex_offset = retype(src, BRW_REGISTER_TYPE_D);
2710          if (instr->is_array)
2711             offset_components = instr->coord_components - 1;
2712          else
2713             offset_components = instr->coord_components;
2714          break;
2715       case nir_tex_src_projector:
2716          unreachable("should be lowered");
2717
2718       case nir_tex_src_sampler_offset: {
2719          /* Figure out the highest possible sampler index and mark it as used */
2720          uint32_t max_used = sampler + instr->sampler_array_size - 1;
2721          if (instr->op == nir_texop_tg4 && devinfo->gen < 8) {
2722             max_used += stage_prog_data->binding_table.gather_texture_start;
2723          } else {
2724             max_used += stage_prog_data->binding_table.texture_start;
2725          }
2726          brw_mark_surface_used(prog_data, max_used);
2727
2728          /* Emit code to evaluate the actual indexing expression */
2729          sampler_reg = vgrf(glsl_type::uint_type);
2730          bld.ADD(sampler_reg, src, brw_imm_ud(sampler));
2731          sampler_reg = bld.emit_uniformize(sampler_reg);
2732          break;
2733       }
2734
2735       default:
2736          unreachable("unknown texture source");
2737       }
2738    }
2739
2740    if (instr->op == nir_texop_txf_ms ||
2741        instr->op == nir_texop_samples_identical) {
2742       if (devinfo->gen >= 7 &&
2743           key_tex->compressed_multisample_layout_mask & (1 << sampler)) {
2744          mcs = emit_mcs_fetch(coordinate, instr->coord_components, sampler_reg);
2745       } else {
2746          mcs = brw_imm_ud(0u);
2747       }
2748    }
2749
2750    for (unsigned i = 0; i < 3; i++) {
2751       if (instr->const_offset[i] != 0) {
2752          assert(offset_components == 0);
2753          tex_offset = brw_imm_ud(brw_texture_offset(instr->const_offset, 3));
2754          break;
2755       }
2756    }
2757
2758    enum glsl_base_type dest_base_type =
2759      brw_glsl_base_type_for_nir_type (instr->dest_type);
2760
2761    const glsl_type *dest_type =
2762       glsl_type::get_instance(dest_base_type, nir_tex_instr_dest_size(instr),
2763                               1);
2764
2765    ir_texture_opcode op;
2766    switch (instr->op) {
2767    case nir_texop_lod: op = ir_lod; break;
2768    case nir_texop_query_levels: op = ir_query_levels; break;
2769    case nir_texop_tex: op = ir_tex; break;
2770    case nir_texop_tg4: op = ir_tg4; break;
2771    case nir_texop_txb: op = ir_txb; break;
2772    case nir_texop_txd: op = ir_txd; break;
2773    case nir_texop_txf: op = ir_txf; break;
2774    case nir_texop_txf_ms: op = ir_txf_ms; break;
2775    case nir_texop_txl: op = ir_txl; break;
2776    case nir_texop_txs: op = ir_txs; break;
2777    case nir_texop_texture_samples: {
2778       fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
2779       fs_inst *inst = bld.emit(SHADER_OPCODE_SAMPLEINFO, dst,
2780                                bld.vgrf(BRW_REGISTER_TYPE_D, 1),
2781                                sampler_reg);
2782       inst->mlen = 1;
2783       inst->header_size = 1;
2784       inst->base_mrf = -1;
2785       return;
2786    }
2787    case nir_texop_samples_identical: op = ir_samples_identical; break;
2788    default:
2789       unreachable("unknown texture opcode");
2790    }
2791
2792    emit_texture(op, dest_type, coordinate, instr->coord_components,
2793                 shadow_comparitor, lod, lod2, lod_components, sample_index,
2794                 tex_offset, mcs, gather_component,
2795                 is_cube_array, sampler, sampler_reg);
2796
2797    fs_reg dest = get_nir_dest(instr->dest);
2798    dest.type = this->result.type;
2799    unsigned num_components = nir_tex_instr_dest_size(instr);
2800    emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(),
2801                              dest, this->result),
2802                 (1 << num_components) - 1);
2803 }
2804
2805 void
2806 fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
2807 {
2808    switch (instr->type) {
2809    case nir_jump_break:
2810       bld.emit(BRW_OPCODE_BREAK);
2811       break;
2812    case nir_jump_continue:
2813       bld.emit(BRW_OPCODE_CONTINUE);
2814       break;
2815    case nir_jump_return:
2816    default:
2817       unreachable("unknown jump");
2818    }
2819 }