src/mesa/drivers/dri/i965/brw_fs_nir.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "glsl/ir.h"
  25 #include "glsl/ir_optimization.h"
  26 #include "glsl/nir/glsl_to_nir.h"
  27 #include "main/shaderimage.h"
  28 #include "program/prog_to_nir.h"
  29 #include "brw_fs.h"
  30 #include "brw_fs_surface_builder.h"
  31 #include "brw_vec4_gs_visitor.h"
  32 #include "brw_nir.h"
  33 #include "brw_fs_surface_builder.h"
  34 #include "brw_vec4_gs_visitor.h"
  35
  36 using namespace brw;
  37 using namespace brw::surface_access;
  38
  39 void
  40 fs_visitor::emit_nir_code()
  41 {
  42    /* emit the arrays used for inputs and outputs - load/store intrinsics will
  43     * be converted to reads/writes of these arrays
  44     */
  45    nir_setup_inputs();
  46    nir_setup_outputs();
  47    nir_setup_uniforms();
  48    nir_emit_system_values();
  49
  50    /* get the main function and emit it */
  51    nir_foreach_overload(nir, overload) {
  52       assert(strcmp(overload->function->name, "main") == 0);
  53       assert(overload->impl);
  54       nir_emit_impl(overload->impl);
  55    }
  56 }
  57
  58 void
  59 fs_visitor::nir_setup_inputs()
  60 {
  61    if (stage != MESA_SHADER_FRAGMENT)
  62       return;
  63
  64    nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs);
  65
  66    nir_foreach_variable(var, &nir->inputs) {
  67       fs_reg input = offset(nir_inputs, bld, var->data.driver_location);
  68
  69       fs_reg reg;
  70       if (var->data.location == VARYING_SLOT_POS) {
  71          reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer,
  72                                              var->data.origin_upper_left);
  73          emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(),
  74                                    input, reg), 0xF);
  75       } else if (var->data.location == VARYING_SLOT_LAYER) {
  76          struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_LAYER, 1), 3);
  77          reg.type = BRW_REGISTER_TYPE_D;
  78          bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D), reg);
  79       } else if (var->data.location == VARYING_SLOT_VIEWPORT) {
  80          struct brw_reg reg = suboffset(interp_reg(VARYING_SLOT_VIEWPORT, 2), 3);
  81          reg.type = BRW_REGISTER_TYPE_D;
  82          bld.emit(FS_OPCODE_CINTERP, retype(input, BRW_REGISTER_TYPE_D), reg);
  83       } else {
  84          emit_general_interpolation(input, var->name, var->type,
  85                                     (glsl_interp_qualifier) var->data.interpolation,
  86                                     var->data.location, var->data.centroid,
  87                                     var->data.sample);
  88       }
  89    }
  90 }
  91
  92 void
  93 fs_visitor::nir_setup_outputs()
  94 {
  95    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
  96
  97    nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_outputs);
  98
  99    nir_foreach_variable(var, &nir->outputs) {
 100       fs_reg reg = offset(nir_outputs, bld, var->data.driver_location);
 101
 102       int vector_elements = var->type->without_array()->vector_elements;
 103
 104       switch (stage) {
 105       case MESA_SHADER_VERTEX:
 106       case MESA_SHADER_GEOMETRY:
 107          for (unsigned int i = 0; i < ALIGN(type_size_scalar(var->type), 4) / 4; i++) {
 108             int output = var->data.location + i;
 109             this->outputs[output] = offset(reg, bld, 4 * i);
 110             this->output_components[output] = vector_elements;
 111          }
 112          break;
 113       case MESA_SHADER_FRAGMENT:
 114          if (var->data.index > 0) {
 115             assert(var->data.location == FRAG_RESULT_DATA0);
 116             assert(var->data.index == 1);
 117             this->dual_src_output = reg;
 118             this->do_dual_src = true;
 119          } else if (var->data.location == FRAG_RESULT_COLOR) {
 120             /* Writing gl_FragColor outputs to all color regions. */
 121             for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
 122                this->outputs[i] = reg;
 123                this->output_components[i] = 4;
 124             }
 125          } else if (var->data.location == FRAG_RESULT_DEPTH) {
 126             this->frag_depth = reg;
 127          } else if (var->data.location == FRAG_RESULT_STENCIL) {
 128             this->frag_stencil = reg;
 129          } else if (var->data.location == FRAG_RESULT_SAMPLE_MASK) {
 130             this->sample_mask = reg;
 131          } else {
 132             /* gl_FragData or a user-defined FS output */
 133             assert(var->data.location >= FRAG_RESULT_DATA0 &&
 134                    var->data.location < FRAG_RESULT_DATA0+BRW_MAX_DRAW_BUFFERS);
 135
 136             /* General color output. */
 137             for (unsigned int i = 0; i < MAX2(1, var->type->length); i++) {
 138                int output = var->data.location - FRAG_RESULT_DATA0 + i;
 139                this->outputs[output] = offset(reg, bld, vector_elements * i);
 140                this->output_components[output] = vector_elements;
 141             }
 142          }
 143          break;
 144       default:
 145          unreachable("unhandled shader stage");
 146       }
 147    }
 148 }
 149
 150 void
 151 fs_visitor::nir_setup_uniforms()
 152 {
 153    if (dispatch_width != 8)
 154       return;
 155
 156    uniforms = nir->num_uniforms;
 157
 158    nir_foreach_variable(var, &nir->uniforms) {
 159       /* UBO's and atomics don't take up space in the uniform file */
 160       if (var->interface_type != NULL || var->type->contains_atomic())
 161          continue;
 162
 163       if (type_size_scalar(var->type) > 0)
 164          param_size[var->data.driver_location] = type_size_scalar(var->type);
 165    }
 166 }
 167
 168 static bool
 169 emit_system_values_block(nir_block *block, void *void_visitor)
 170 {
 171    fs_visitor *v = (fs_visitor *)void_visitor;
 172    fs_reg *reg;
 173
 174    nir_foreach_instr(block, instr) {
 175       if (instr->type != nir_instr_type_intrinsic)
 176          continue;
 177
 178       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 179       switch (intrin->intrinsic) {
 180       case nir_intrinsic_load_vertex_id:
 181          unreachable("should be lowered by lower_vertex_id().");
 182
 183       case nir_intrinsic_load_vertex_id_zero_base:
 184          assert(v->stage == MESA_SHADER_VERTEX);
 185          reg = &v->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
 186          if (reg->file == BAD_FILE)
 187             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
 188          break;
 189
 190       case nir_intrinsic_load_base_vertex:
 191          assert(v->stage == MESA_SHADER_VERTEX);
 192          reg = &v->nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
 193          if (reg->file == BAD_FILE)
 194             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_VERTEX);
 195          break;
 196
 197       case nir_intrinsic_load_instance_id:
 198          assert(v->stage == MESA_SHADER_VERTEX);
 199          reg = &v->nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
 200          if (reg->file == BAD_FILE)
 201             *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
 202          break;
 203
 204       case nir_intrinsic_load_invocation_id:
 205          assert(v->stage == MESA_SHADER_GEOMETRY);
 206          reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
 207          if (reg->file == BAD_FILE) {
 208             const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
 209             fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 210             fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
 211             abld.SHR(iid, g1, fs_reg(27u));
 212             *reg = iid;
 213          }
 214          break;
 215
 216       case nir_intrinsic_load_sample_pos:
 217          assert(v->stage == MESA_SHADER_FRAGMENT);
 218          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
 219          if (reg->file == BAD_FILE)
 220             *reg = *v->emit_samplepos_setup();
 221          break;
 222
 223       case nir_intrinsic_load_sample_id:
 224          assert(v->stage == MESA_SHADER_FRAGMENT);
 225          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
 226          if (reg->file == BAD_FILE)
 227             *reg = *v->emit_sampleid_setup();
 228          break;
 229
 230       case nir_intrinsic_load_sample_mask_in:
 231          assert(v->stage == MESA_SHADER_FRAGMENT);
 232          assert(v->devinfo->gen >= 7);
 233          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
 234          if (reg->file == BAD_FILE)
 235             *reg = fs_reg(retype(brw_vec8_grf(v->payload.sample_mask_in_reg, 0),
 236                                  BRW_REGISTER_TYPE_D));
 237          break;
 238
 239       case nir_intrinsic_load_local_invocation_id:
 240          assert(v->stage == MESA_SHADER_COMPUTE);
 241          reg = &v->nir_system_values[SYSTEM_VALUE_LOCAL_INVOCATION_ID];
 242          if (reg->file == BAD_FILE)
 243             *reg = *v->emit_cs_local_invocation_id_setup();
 244          break;
 245
 246       case nir_intrinsic_load_work_group_id:
 247          assert(v->stage == MESA_SHADER_COMPUTE);
 248          reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
 249          if (reg->file == BAD_FILE)
 250             *reg = *v->emit_cs_work_group_id_setup();
 251          break;
 252
 253       default:
 254          break;
 255       }
 256    }
 257
 258    return true;
 259 }
 260
 261 void
 262 fs_visitor::nir_emit_system_values()
 263 {
 264    nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
 265    nir_foreach_overload(nir, overload) {
 266       assert(strcmp(overload->function->name, "main") == 0);
 267       assert(overload->impl);
 268       nir_foreach_block(overload->impl, emit_system_values_block, this);
 269    }
 270 }
 271
 272 void
 273 fs_visitor::nir_emit_impl(nir_function_impl *impl)
 274 {
 275    nir_locals = reralloc(mem_ctx, nir_locals, fs_reg, impl->reg_alloc);
 276    foreach_list_typed(nir_register, reg, node, &impl->registers) {
 277       unsigned array_elems =
 278          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
 279       unsigned size = array_elems * reg->num_components;
 280       nir_locals[reg->index] = bld.vgrf(BRW_REGISTER_TYPE_F, size);
 281    }
 282
 283    nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
 284                              impl->ssa_alloc);
 285
 286    nir_emit_cf_list(&impl->body);
 287 }
 288
 289 void
 290 fs_visitor::nir_emit_cf_list(exec_list *list)
 291 {
 292    exec_list_validate(list);
 293    foreach_list_typed(nir_cf_node, node, node, list) {
 294       switch (node->type) {
 295       case nir_cf_node_if:
 296          nir_emit_if(nir_cf_node_as_if(node));
 297          break;
 298
 299       case nir_cf_node_loop:
 300          nir_emit_loop(nir_cf_node_as_loop(node));
 301          break;
 302
 303       case nir_cf_node_block:
 304          nir_emit_block(nir_cf_node_as_block(node));
 305          break;
 306
 307       default:
 308          unreachable("Invalid CFG node block");
 309       }
 310    }
 311 }
 312
 313 void
 314 fs_visitor::nir_emit_if(nir_if *if_stmt)
 315 {
 316    /* first, put the condition into f0 */
 317    fs_inst *inst = bld.MOV(bld.null_reg_d(),
 318                             retype(get_nir_src(if_stmt->condition),
 319                                    BRW_REGISTER_TYPE_D));
 320    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 321
 322    bld.IF(BRW_PREDICATE_NORMAL);
 323
 324    nir_emit_cf_list(&if_stmt->then_list);
 325
 326    /* note: if the else is empty, dead CF elimination will remove it */
 327    bld.emit(BRW_OPCODE_ELSE);
 328
 329    nir_emit_cf_list(&if_stmt->else_list);
 330
 331    bld.emit(BRW_OPCODE_ENDIF);
 332 }
 333
 334 void
 335 fs_visitor::nir_emit_loop(nir_loop *loop)
 336 {
 337    bld.emit(BRW_OPCODE_DO);
 338
 339    nir_emit_cf_list(&loop->body);
 340
 341    bld.emit(BRW_OPCODE_WHILE);
 342 }
 343
 344 void
 345 fs_visitor::nir_emit_block(nir_block *block)
 346 {
 347    nir_foreach_instr(block, instr) {
 348       nir_emit_instr(instr);
 349    }
 350 }
 351
 352 void
 353 fs_visitor::nir_emit_instr(nir_instr *instr)
 354 {
 355    const fs_builder abld = bld.annotate(NULL, instr);
 356
 357    switch (instr->type) {
 358    case nir_instr_type_alu:
 359       nir_emit_alu(abld, nir_instr_as_alu(instr));
 360       break;
 361
 362    case nir_instr_type_intrinsic:
 363       nir_emit_intrinsic(abld, nir_instr_as_intrinsic(instr));
 364       break;
 365
 366    case nir_instr_type_tex:
 367       nir_emit_texture(abld, nir_instr_as_tex(instr));
 368       break;
 369
 370    case nir_instr_type_load_const:
 371       nir_emit_load_const(abld, nir_instr_as_load_const(instr));
 372       break;
 373
 374    case nir_instr_type_ssa_undef:
 375       nir_emit_undef(abld, nir_instr_as_ssa_undef(instr));
 376       break;
 377
 378    case nir_instr_type_jump:
 379       nir_emit_jump(abld, nir_instr_as_jump(instr));
 380       break;
 381
 382    default:
 383       unreachable("unknown instruction type");
 384    }
 385 }
 386
 387 bool
 388 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
 389                                          const fs_reg &result)
 390 {
 391    if (!instr->src[0].src.is_ssa ||
 392        instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
 393       return false;
 394
 395    nir_intrinsic_instr *src0 =
 396       nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
 397
 398    if (src0->intrinsic != nir_intrinsic_load_front_face)
 399       return false;
 400
 401    nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
 402    if (!value1 || fabsf(value1->f[0]) != 1.0f)
 403       return false;
 404
 405    nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src);
 406    if (!value2 || fabsf(value2->f[0]) != 1.0f)
 407       return false;
 408
 409    fs_reg tmp = vgrf(glsl_type::int_type);
 410
 411    if (devinfo->gen >= 6) {
 412       /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
 413       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
 414
 415       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 416        *
 417        *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
 418        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
 419        *
 420        * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
 421        *
 422        * This negation looks like it's safe in practice, because bits 0:4 will
 423        * surely be TRIANGLES
 424        */
 425
 426       if (value1->f[0] == -1.0f) {
 427          g0.negate = true;
 428       }
 429
 430       tmp.type = BRW_REGISTER_TYPE_W;
 431       tmp.subreg_offset = 2;
 432       tmp.stride = 2;
 433
 434       fs_inst *or_inst = bld.OR(tmp, g0, fs_reg(0x3f80));
 435       or_inst->src[1].type = BRW_REGISTER_TYPE_UW;
 436
 437       tmp.type = BRW_REGISTER_TYPE_D;
 438       tmp.subreg_offset = 0;
 439       tmp.stride = 1;
 440    } else {
 441       /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
 442       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
 443
 444       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 445        *
 446        *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
 447        *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
 448        *
 449        * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
 450        *
 451        * This negation looks like it's safe in practice, because bits 0:4 will
 452        * surely be TRIANGLES
 453        */
 454
 455       if (value1->f[0] == -1.0f) {
 456          g1_6.negate = true;
 457       }
 458
 459       bld.OR(tmp, g1_6, fs_reg(0x3f800000));
 460    }
 461    bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, fs_reg(0xbf800000));
 462
 463    return true;
 464 }
 465
 466 void
 467 fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
 468 {
 469    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
 470    fs_inst *inst;
 471
 472    fs_reg result = get_nir_dest(instr->dest.dest);
 473    result.type = brw_type_for_nir_type(nir_op_infos[instr->op].output_type);
 474
 475    fs_reg op[4];
 476    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 477       op[i] = get_nir_src(instr->src[i].src);
 478       op[i].type = brw_type_for_nir_type(nir_op_infos[instr->op].input_types[i]);
 479       op[i].abs = instr->src[i].abs;
 480       op[i].negate = instr->src[i].negate;
 481    }
 482
 483    /* We get a bunch of mov's out of the from_ssa pass and they may still
 484     * be vectorized.  We'll handle them as a special-case.  We'll also
 485     * handle vecN here because it's basically the same thing.
 486     */
 487    switch (instr->op) {
 488    case nir_op_imov:
 489    case nir_op_fmov:
 490    case nir_op_vec2:
 491    case nir_op_vec3:
 492    case nir_op_vec4: {
 493       fs_reg temp = result;
 494       bool need_extra_copy = false;
 495       for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 496          if (!instr->src[i].src.is_ssa &&
 497              instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
 498             need_extra_copy = true;
 499             temp = bld.vgrf(result.type, 4);
 500             break;
 501          }
 502       }
 503
 504       for (unsigned i = 0; i < 4; i++) {
 505          if (!(instr->dest.write_mask & (1 << i)))
 506             continue;
 507
 508          if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
 509             inst = bld.MOV(offset(temp, bld, i),
 510                            offset(op[0], bld, instr->src[0].swizzle[i]));
 511          } else {
 512             inst = bld.MOV(offset(temp, bld, i),
 513                            offset(op[i], bld, instr->src[i].swizzle[0]));
 514          }
 515          inst->saturate = instr->dest.saturate;
 516       }
 517
 518       /* In this case the source and destination registers were the same,
 519        * so we need to insert an extra set of moves in order to deal with
 520        * any swizzling.
 521        */
 522       if (need_extra_copy) {
 523          for (unsigned i = 0; i < 4; i++) {
 524             if (!(instr->dest.write_mask & (1 << i)))
 525                continue;
 526
 527             bld.MOV(offset(result, bld, i), offset(temp, bld, i));
 528          }
 529       }
 530       return;
 531    }
 532    default:
 533       break;
 534    }
 535
 536    /* At this point, we have dealt with any instruction that operates on
 537     * more than a single channel.  Therefore, we can just adjust the source
 538     * and destination registers for that channel and emit the instruction.
 539     */
 540    unsigned channel = 0;
 541    if (nir_op_infos[instr->op].output_size == 0) {
 542       /* Since NIR is doing the scalarizing for us, we should only ever see
 543        * vectorized operations with a single channel.
 544        */
 545       assert(_mesa_bitcount(instr->dest.write_mask) == 1);
 546       channel = ffs(instr->dest.write_mask) - 1;
 547
 548       result = offset(result, bld, channel);
 549    }
 550
 551    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 552       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
 553       op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
 554    }
 555
 556    switch (instr->op) {
 557    case nir_op_i2f:
 558    case nir_op_u2f:
 559       inst = bld.MOV(result, op[0]);
 560       inst->saturate = instr->dest.saturate;
 561       break;
 562
 563    case nir_op_f2i:
 564    case nir_op_f2u:
 565       bld.MOV(result, op[0]);
 566       break;
 567
 568    case nir_op_fsign: {
 569       /* AND(val, 0x80000000) gives the sign bit.
 570          *
 571          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
 572          * zero.
 573          */
 574       bld.CMP(bld.null_reg_f(), op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ);
 575
 576       fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
 577       op[0].type = BRW_REGISTER_TYPE_UD;
 578       result.type = BRW_REGISTER_TYPE_UD;
 579       bld.AND(result_int, op[0], fs_reg(0x80000000u));
 580
 581       inst = bld.OR(result_int, result_int, fs_reg(0x3f800000u));
 582       inst->predicate = BRW_PREDICATE_NORMAL;
 583       if (instr->dest.saturate) {
 584          inst = bld.MOV(result, result);
 585          inst->saturate = true;
 586       }
 587       break;
 588    }
 589
 590    case nir_op_isign:
 591       /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
 592        *               -> non-negative val generates 0x00000000.
 593        *  Predicated OR sets 1 if val is positive.
 594        */
 595       bld.CMP(bld.null_reg_d(), op[0], fs_reg(0), BRW_CONDITIONAL_G);
 596       bld.ASR(result, op[0], fs_reg(31));
 597       inst = bld.OR(result, result, fs_reg(1));
 598       inst->predicate = BRW_PREDICATE_NORMAL;
 599       break;
 600
 601    case nir_op_frcp:
 602       inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
 603       inst->saturate = instr->dest.saturate;
 604       break;
 605
 606    case nir_op_fexp2:
 607       inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
 608       inst->saturate = instr->dest.saturate;
 609       break;
 610
 611    case nir_op_flog2:
 612       inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
 613       inst->saturate = instr->dest.saturate;
 614       break;
 615
 616    case nir_op_fsin:
 617       inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
 618       inst->saturate = instr->dest.saturate;
 619       break;
 620
 621    case nir_op_fcos:
 622       inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
 623       inst->saturate = instr->dest.saturate;
 624       break;
 625
 626    case nir_op_fddx:
 627       if (fs_key->high_quality_derivatives) {
 628          inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
 629       } else {
 630          inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
 631       }
 632       inst->saturate = instr->dest.saturate;
 633       break;
 634    case nir_op_fddx_fine:
 635       inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
 636       inst->saturate = instr->dest.saturate;
 637       break;
 638    case nir_op_fddx_coarse:
 639       inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
 640       inst->saturate = instr->dest.saturate;
 641       break;
 642    case nir_op_fddy:
 643       if (fs_key->high_quality_derivatives) {
 644          inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0],
 645                          fs_reg(fs_key->render_to_fbo));
 646       } else {
 647          inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0],
 648                          fs_reg(fs_key->render_to_fbo));
 649       }
 650       inst->saturate = instr->dest.saturate;
 651       break;
 652    case nir_op_fddy_fine:
 653       inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0],
 654                       fs_reg(fs_key->render_to_fbo));
 655       inst->saturate = instr->dest.saturate;
 656       break;
 657    case nir_op_fddy_coarse:
 658       inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0],
 659                       fs_reg(fs_key->render_to_fbo));
 660       inst->saturate = instr->dest.saturate;
 661       break;
 662
 663    case nir_op_fadd:
 664    case nir_op_iadd:
 665       inst = bld.ADD(result, op[0], op[1]);
 666       inst->saturate = instr->dest.saturate;
 667       break;
 668
 669    case nir_op_fmul:
 670       inst = bld.MUL(result, op[0], op[1]);
 671       inst->saturate = instr->dest.saturate;
 672       break;
 673
 674    case nir_op_imul:
 675       bld.MUL(result, op[0], op[1]);
 676       break;
 677
 678    case nir_op_imul_high:
 679    case nir_op_umul_high:
 680       bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
 681       break;
 682
 683    case nir_op_idiv:
 684    case nir_op_udiv:
 685       bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
 686       break;
 687
 688    case nir_op_uadd_carry:
 689       unreachable("Should have been lowered by carry_to_arith().");
 690
 691    case nir_op_usub_borrow:
 692       unreachable("Should have been lowered by borrow_to_arith().");
 693
 694    case nir_op_umod:
 695       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
 696       break;
 697
 698    case nir_op_flt:
 699    case nir_op_ilt:
 700    case nir_op_ult:
 701       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_L);
 702       break;
 703
 704    case nir_op_fge:
 705    case nir_op_ige:
 706    case nir_op_uge:
 707       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_GE);
 708       break;
 709
 710    case nir_op_feq:
 711    case nir_op_ieq:
 712       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_Z);
 713       break;
 714
 715    case nir_op_fne:
 716    case nir_op_ine:
 717       bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_NZ);
 718       break;
 719
 720    case nir_op_inot:
 721       if (devinfo->gen >= 8) {
 722          op[0] = resolve_source_modifiers(op[0]);
 723       }
 724       bld.NOT(result, op[0]);
 725       break;
 726    case nir_op_ixor:
 727       if (devinfo->gen >= 8) {
 728          op[0] = resolve_source_modifiers(op[0]);
 729          op[1] = resolve_source_modifiers(op[1]);
 730       }
 731       bld.XOR(result, op[0], op[1]);
 732       break;
 733    case nir_op_ior:
 734       if (devinfo->gen >= 8) {
 735          op[0] = resolve_source_modifiers(op[0]);
 736          op[1] = resolve_source_modifiers(op[1]);
 737       }
 738       bld.OR(result, op[0], op[1]);
 739       break;
 740    case nir_op_iand:
 741       if (devinfo->gen >= 8) {
 742          op[0] = resolve_source_modifiers(op[0]);
 743          op[1] = resolve_source_modifiers(op[1]);
 744       }
 745       bld.AND(result, op[0], op[1]);
 746       break;
 747
 748    case nir_op_fdot2:
 749    case nir_op_fdot3:
 750    case nir_op_fdot4:
 751    case nir_op_bany2:
 752    case nir_op_bany3:
 753    case nir_op_bany4:
 754    case nir_op_ball2:
 755    case nir_op_ball3:
 756    case nir_op_ball4:
 757    case nir_op_ball_fequal2:
 758    case nir_op_ball_iequal2:
 759    case nir_op_ball_fequal3:
 760    case nir_op_ball_iequal3:
 761    case nir_op_ball_fequal4:
 762    case nir_op_ball_iequal4:
 763    case nir_op_bany_fnequal2:
 764    case nir_op_bany_inequal2:
 765    case nir_op_bany_fnequal3:
 766    case nir_op_bany_inequal3:
 767    case nir_op_bany_fnequal4:
 768    case nir_op_bany_inequal4:
 769       unreachable("Lowered by nir_lower_alu_reductions");
 770
 771    case nir_op_fnoise1_1:
 772    case nir_op_fnoise1_2:
 773    case nir_op_fnoise1_3:
 774    case nir_op_fnoise1_4:
 775    case nir_op_fnoise2_1:
 776    case nir_op_fnoise2_2:
 777    case nir_op_fnoise2_3:
 778    case nir_op_fnoise2_4:
 779    case nir_op_fnoise3_1:
 780    case nir_op_fnoise3_2:
 781    case nir_op_fnoise3_3:
 782    case nir_op_fnoise3_4:
 783    case nir_op_fnoise4_1:
 784    case nir_op_fnoise4_2:
 785    case nir_op_fnoise4_3:
 786    case nir_op_fnoise4_4:
 787       unreachable("not reached: should be handled by lower_noise");
 788
 789    case nir_op_ldexp:
 790       unreachable("not reached: should be handled by ldexp_to_arith()");
 791
 792    case nir_op_fsqrt:
 793       inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
 794       inst->saturate = instr->dest.saturate;
 795       break;
 796
 797    case nir_op_frsq:
 798       inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
 799       inst->saturate = instr->dest.saturate;
 800       break;
 801
 802    case nir_op_b2i:
 803    case nir_op_b2f:
 804       bld.MOV(result, negate(op[0]));
 805       break;
 806
 807    case nir_op_f2b:
 808       bld.CMP(result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ);
 809       break;
 810    case nir_op_i2b:
 811       bld.CMP(result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ);
 812       break;
 813
 814    case nir_op_ftrunc:
 815       inst = bld.RNDZ(result, op[0]);
 816       inst->saturate = instr->dest.saturate;
 817       break;
 818
 819    case nir_op_fceil: {
 820       op[0].negate = !op[0].negate;
 821       fs_reg temp = vgrf(glsl_type::float_type);
 822       bld.RNDD(temp, op[0]);
 823       temp.negate = true;
 824       inst = bld.MOV(result, temp);
 825       inst->saturate = instr->dest.saturate;
 826       break;
 827    }
 828    case nir_op_ffloor:
 829       inst = bld.RNDD(result, op[0]);
 830       inst->saturate = instr->dest.saturate;
 831       break;
 832    case nir_op_ffract:
 833       inst = bld.FRC(result, op[0]);
 834       inst->saturate = instr->dest.saturate;
 835       break;
 836    case nir_op_fround_even:
 837       inst = bld.RNDE(result, op[0]);
 838       inst->saturate = instr->dest.saturate;
 839       break;
 840
 841    case nir_op_fmin:
 842    case nir_op_imin:
 843    case nir_op_umin:
 844       if (devinfo->gen >= 6) {
 845          inst = bld.emit(BRW_OPCODE_SEL, result, op[0], op[1]);
 846          inst->conditional_mod = BRW_CONDITIONAL_L;
 847       } else {
 848          bld.CMP(bld.null_reg_d(), op[0], op[1], BRW_CONDITIONAL_L);
 849          inst = bld.SEL(result, op[0], op[1]);
 850          inst->predicate = BRW_PREDICATE_NORMAL;
 851       }
 852       inst->saturate = instr->dest.saturate;
 853       break;
 854
 855    case nir_op_fmax:
 856    case nir_op_imax:
 857    case nir_op_umax:
 858       if (devinfo->gen >= 6) {
 859          inst = bld.emit(BRW_OPCODE_SEL, result, op[0], op[1]);
 860          inst->conditional_mod = BRW_CONDITIONAL_GE;
 861       } else {
 862          bld.CMP(bld.null_reg_d(), op[0], op[1], BRW_CONDITIONAL_GE);
 863          inst = bld.SEL(result, op[0], op[1]);
 864          inst->predicate = BRW_PREDICATE_NORMAL;
 865       }
 866       inst->saturate = instr->dest.saturate;
 867       break;
 868
 869    case nir_op_pack_snorm_2x16:
 870    case nir_op_pack_snorm_4x8:
 871    case nir_op_pack_unorm_2x16:
 872    case nir_op_pack_unorm_4x8:
 873    case nir_op_unpack_snorm_2x16:
 874    case nir_op_unpack_snorm_4x8:
 875    case nir_op_unpack_unorm_2x16:
 876    case nir_op_unpack_unorm_4x8:
 877    case nir_op_unpack_half_2x16:
 878    case nir_op_pack_half_2x16:
 879       unreachable("not reached: should be handled by lower_packing_builtins");
 880
 881    case nir_op_unpack_half_2x16_split_x:
 882       inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
 883       inst->saturate = instr->dest.saturate;
 884       break;
 885    case nir_op_unpack_half_2x16_split_y:
 886       inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
 887       inst->saturate = instr->dest.saturate;
 888       break;
 889
 890    case nir_op_fpow:
 891       inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
 892       inst->saturate = instr->dest.saturate;
 893       break;
 894
 895    case nir_op_bitfield_reverse:
 896       bld.BFREV(result, op[0]);
 897       break;
 898
 899    case nir_op_bit_count:
 900       bld.CBIT(result, op[0]);
 901       break;
 902
 903    case nir_op_ufind_msb:
 904    case nir_op_ifind_msb: {
 905       bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
 906
 907       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
 908        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
 909        * subtract the result from 31 to convert the MSB count into an LSB count.
 910        */
 911       bld.CMP(bld.null_reg_d(), result, fs_reg(-1), BRW_CONDITIONAL_NZ);
 912
 913       inst = bld.ADD(result, result, fs_reg(31));
 914       inst->predicate = BRW_PREDICATE_NORMAL;
 915       inst->src[0].negate = true;
 916       break;
 917    }
 918
 919    case nir_op_find_lsb:
 920       bld.FBL(result, op[0]);
 921       break;
 922
 923    case nir_op_ubitfield_extract:
 924    case nir_op_ibitfield_extract:
 925       bld.BFE(result, op[2], op[1], op[0]);
 926       break;
 927    case nir_op_bfm:
 928       bld.BFI1(result, op[0], op[1]);
 929       break;
 930    case nir_op_bfi:
 931       bld.BFI2(result, op[0], op[1], op[2]);
 932       break;
 933
 934    case nir_op_bitfield_insert:
 935       unreachable("not reached: should be handled by "
 936                   "lower_instructions::bitfield_insert_to_bfm_bfi");
 937
 938    case nir_op_ishl:
 939       bld.SHL(result, op[0], op[1]);
 940       break;
 941    case nir_op_ishr:
 942       bld.ASR(result, op[0], op[1]);
 943       break;
 944    case nir_op_ushr:
 945       bld.SHR(result, op[0], op[1]);
 946       break;
 947
 948    case nir_op_pack_half_2x16_split:
 949       bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
 950       break;
 951
 952    case nir_op_ffma:
 953       inst = bld.MAD(result, op[2], op[1], op[0]);
 954       inst->saturate = instr->dest.saturate;
 955       break;
 956
 957    case nir_op_flrp:
 958       inst = bld.LRP(result, op[0], op[1], op[2]);
 959       inst->saturate = instr->dest.saturate;
 960       break;
 961
 962    case nir_op_bcsel:
 963       if (optimize_frontfacing_ternary(instr, result))
 964          return;
 965
 966       bld.CMP(bld.null_reg_d(), op[0], fs_reg(0), BRW_CONDITIONAL_NZ);
 967       inst = bld.SEL(result, op[1], op[2]);
 968       inst->predicate = BRW_PREDICATE_NORMAL;
 969       break;
 970
 971    default:
 972       unreachable("unhandled instruction");
 973    }
 974
 975    /* If we need to do a boolean resolve, replace the result with -(x & 1)
 976     * to sign extend the low bit to 0/~0
 977     */
 978    if (devinfo->gen <= 5 &&
 979        (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
 980       fs_reg masked = vgrf(glsl_type::int_type);
 981       bld.AND(masked, result, fs_reg(1));
 982       masked.negate = true;
 983       bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
 984    }
 985 }
 986
 987 void
 988 fs_visitor::nir_emit_load_const(const fs_builder &bld,
 989                                 nir_load_const_instr *instr)
 990 {
 991    fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_D, instr->def.num_components);
 992
 993    for (unsigned i = 0; i < instr->def.num_components; i++)
 994       bld.MOV(offset(reg, bld, i), fs_reg(instr->value.i[i]));
 995
 996    nir_ssa_values[instr->def.index] = reg;
 997 }
 998
 999 void
1000 fs_visitor::nir_emit_undef(const fs_builder &bld, nir_ssa_undef_instr *instr)
1001 {
1002    nir_ssa_values[instr->def.index] = bld.vgrf(BRW_REGISTER_TYPE_D,
1003                                                instr->def.num_components);
1004 }
1005
1006 static fs_reg
1007 fs_reg_for_nir_reg(fs_visitor *v, nir_register *nir_reg,
1008                    unsigned base_offset, nir_src *indirect)
1009 {
1010    fs_reg reg;
1011
1012    assert(!nir_reg->is_global);
1013
1014    reg = v->nir_locals[nir_reg->index];
1015
1016    reg = offset(reg, v->bld, base_offset * nir_reg->num_components);
1017    if (indirect) {
1018       int multiplier = nir_reg->num_components * (v->dispatch_width / 8);
1019
1020       reg.reladdr = new(v->mem_ctx) fs_reg(v->vgrf(glsl_type::int_type));
1021       v->bld.MUL(*reg.reladdr, v->get_nir_src(*indirect),
1022                  fs_reg(multiplier));
1023    }
1024
1025    return reg;
1026 }
1027
1028 fs_reg
1029 fs_visitor::get_nir_src(nir_src src)
1030 {
1031    fs_reg reg;
1032    if (src.is_ssa) {
1033       reg = nir_ssa_values[src.ssa->index];
1034    } else {
1035       reg = fs_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset,
1036                                src.reg.indirect);
1037    }
1038
1039    /* to avoid floating-point denorm flushing problems, set the type by
1040     * default to D - instructions that need floating point semantics will set
1041     * this to F if they need to
1042     */
1043    return retype(reg, BRW_REGISTER_TYPE_D);
1044 }
1045
1046 fs_reg
1047 fs_visitor::get_nir_dest(nir_dest dest)
1048 {
1049    if (dest.is_ssa) {
1050       nir_ssa_values[dest.ssa.index] = bld.vgrf(BRW_REGISTER_TYPE_F,
1051                                                 dest.ssa.num_components);
1052       return nir_ssa_values[dest.ssa.index];
1053    }
1054
1055    return fs_reg_for_nir_reg(this, dest.reg.reg, dest.reg.base_offset,
1056                              dest.reg.indirect);
1057 }
1058
1059 fs_reg
1060 fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
1061 {
1062    fs_reg image(UNIFORM, deref->var->data.driver_location,
1063                 BRW_REGISTER_TYPE_UD);
1064
1065    for (const nir_deref *tail = &deref->deref; tail->child;
1066         tail = tail->child) {
1067       const nir_deref_array *deref_array = nir_deref_as_array(tail->child);
1068       assert(tail->child->deref_type == nir_deref_type_array);
1069       const unsigned size = glsl_get_length(tail->type);
1070       const unsigned element_size = type_size_scalar(deref_array->deref.type);
1071       const unsigned base = MIN2(deref_array->base_offset, size - 1);
1072       image = offset(image, bld, base * element_size);
1073
1074       if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
1075          fs_reg tmp = vgrf(glsl_type::int_type);
1076
1077          if (devinfo->gen == 7 && !devinfo->is_haswell) {
1078             /* IVB hangs when trying to access an invalid surface index with
1079              * the dataport.  According to the spec "if the index used to
1080              * select an individual element is negative or greater than or
1081              * equal to the size of the array, the results of the operation
1082              * are undefined but may not lead to termination" -- which is one
1083              * of the possible outcomes of the hang.  Clamp the index to
1084              * prevent access outside of the array bounds.
1085              */
1086             bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect),
1087                                         BRW_REGISTER_TYPE_UD),
1088                             fs_reg(size - base - 1), BRW_CONDITIONAL_L);
1089          } else {
1090             bld.MOV(tmp, get_nir_src(deref_array->indirect));
1091          }
1092
1093          bld.MUL(tmp, tmp, fs_reg(element_size));
1094          if (image.reladdr)
1095             bld.ADD(*image.reladdr, *image.reladdr, tmp);
1096          else
1097             image.reladdr = new(mem_ctx) fs_reg(tmp);
1098       }
1099    }
1100
1101    return image;
1102 }
1103
1104 void
1105 fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
1106                          unsigned wr_mask)
1107 {
1108    for (unsigned i = 0; i < 4; i++) {
1109       if (!((wr_mask >> i) & 1))
1110          continue;
1111
1112       fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
1113       new_inst->dst = offset(new_inst->dst, bld, i);
1114       for (unsigned j = 0; j < new_inst->sources; j++)
1115          if (new_inst->src[j].file == GRF)
1116             new_inst->src[j] = offset(new_inst->src[j], bld, i);
1117
1118       bld.emit(new_inst);
1119    }
1120 }
1121
1122 /**
1123  * Get the matching channel register datatype for an image intrinsic of the
1124  * specified GLSL image type.
1125  */
1126 static brw_reg_type
1127 get_image_base_type(const glsl_type *type)
1128 {
1129    switch ((glsl_base_type)type->sampler_type) {
1130    case GLSL_TYPE_UINT:
1131       return BRW_REGISTER_TYPE_UD;
1132    case GLSL_TYPE_INT:
1133       return BRW_REGISTER_TYPE_D;
1134    case GLSL_TYPE_FLOAT:
1135       return BRW_REGISTER_TYPE_F;
1136    default:
1137       unreachable("Not reached.");
1138    }
1139 }
1140
1141 /**
1142  * Get the appropriate atomic op for an image atomic intrinsic.
1143  */
1144 static unsigned
1145 get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type)
1146 {
1147    switch (op) {
1148    case nir_intrinsic_image_atomic_add:
1149       return BRW_AOP_ADD;
1150    case nir_intrinsic_image_atomic_min:
1151       return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
1152               BRW_AOP_IMIN : BRW_AOP_UMIN);
1153    case nir_intrinsic_image_atomic_max:
1154       return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
1155               BRW_AOP_IMAX : BRW_AOP_UMAX);
1156    case nir_intrinsic_image_atomic_and:
1157       return BRW_AOP_AND;
1158    case nir_intrinsic_image_atomic_or:
1159       return BRW_AOP_OR;
1160    case nir_intrinsic_image_atomic_xor:
1161       return BRW_AOP_XOR;
1162    case nir_intrinsic_image_atomic_exchange:
1163       return BRW_AOP_MOV;
1164    case nir_intrinsic_image_atomic_comp_swap:
1165       return BRW_AOP_CMPWR;
1166    default:
1167       unreachable("Not reachable.");
1168    }
1169 }
1170
1171 static fs_inst *
1172 emit_pixel_interpolater_send(const fs_builder &bld,
1173                              enum opcode opcode,
1174                              const fs_reg &dst,
1175                              const fs_reg &src,
1176                              const fs_reg &desc,
1177                              glsl_interp_qualifier interpolation)
1178 {
1179    fs_inst *inst;
1180    fs_reg payload;
1181    int mlen;
1182
1183    if (src.file == BAD_FILE) {
1184       /* Dummy payload */
1185       payload = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
1186       mlen = 1;
1187    } else {
1188       payload = src;
1189       mlen = 2 * bld.dispatch_width() / 8;
1190    }
1191
1192    inst = bld.emit(opcode, dst, payload, desc);
1193    inst->mlen = mlen;
1194    /* 2 floats per slot returned */
1195    inst->regs_written = 2 * bld.dispatch_width() / 8;
1196    inst->pi_noperspective = interpolation == INTERP_QUALIFIER_NOPERSPECTIVE;
1197
1198    return inst;
1199 }
1200
1201 /**
1202  * Computes 1 << x, given a D/UD register containing some value x.
1203  */
1204 static fs_reg
1205 intexp2(const fs_builder &bld, const fs_reg &x)
1206 {
1207    assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
1208
1209    fs_reg result = bld.vgrf(x.type, 1);
1210    fs_reg one = bld.vgrf(x.type, 1);
1211
1212    bld.MOV(one, retype(fs_reg(1), one.type));
1213    bld.SHL(result, one, x);
1214    return result;
1215 }
1216
1217 void
1218 fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
1219 {
1220    assert(stage == MESA_SHADER_GEOMETRY);
1221
1222    struct brw_gs_prog_data *gs_prog_data =
1223       (struct brw_gs_prog_data *) prog_data;
1224
1225    /* We can only do EndPrimitive() functionality when the control data
1226     * consists of cut bits.  Fortunately, the only time it isn't is when the
1227     * output type is points, in which case EndPrimitive() is a no-op.
1228     */
1229    if (gs_prog_data->control_data_format !=
1230        GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
1231       return;
1232    }
1233
1234    /* Cut bits use one bit per vertex. */
1235    assert(gs_compile->control_data_bits_per_vertex == 1);
1236
1237    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1238    vertex_count.type = BRW_REGISTER_TYPE_UD;
1239
1240    /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
1241     * vertex n, 0 otherwise.  So all we need to do here is mark bit
1242     * (vertex_count - 1) % 32 in the cut_bits register to indicate that
1243     * EndPrimitive() was called after emitting vertex (vertex_count - 1);
1244     * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
1245     *
1246     * Note that if EndPrimitive() is called before emitting any vertices, this
1247     * will cause us to set bit 31 of the control_data_bits register to 1.
1248     * That's fine because:
1249     *
1250     * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
1251     *   output, so the hardware will ignore cut bit 31.
1252     *
1253     * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
1254     *   last vertex, so setting cut bit 31 has no effect (since the primitive
1255     *   is automatically ended when the GS terminates).
1256     *
1257     * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
1258     *   control_data_bits register to 0 when the first vertex is emitted.
1259     */
1260
1261    const fs_builder abld = bld.annotate("end primitive");
1262
1263    /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
1264    fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1265    abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu));
1266    fs_reg mask = intexp2(abld, prev_count);
1267    /* Note: we're relying on the fact that the GEN SHL instruction only pays
1268     * attention to the lower 5 bits of its second source argument, so on this
1269     * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
1270     * ((vertex_count - 1) % 32).
1271     */
1272    abld.OR(this->control_data_bits, this->control_data_bits, mask);
1273 }
1274
1275 void
1276 fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
1277 {
1278    assert(stage == MESA_SHADER_GEOMETRY);
1279    assert(gs_compile->control_data_bits_per_vertex != 0);
1280
1281    struct brw_gs_prog_data *gs_prog_data =
1282       (struct brw_gs_prog_data *) prog_data;
1283
1284    const fs_builder abld = bld.annotate("emit control data bits");
1285    const fs_builder fwa_bld = bld.exec_all();
1286
1287    /* We use a single UD register to accumulate control data bits (32 bits
1288     * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
1289     * at a time.
1290     *
1291     * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
1292     * We have select a 128-bit group via the Global and Per-Slot Offsets, then
1293     * use the Channel Mask phase to enable/disable which DWord within that
1294     * group to write.  (Remember, different SIMD8 channels may have emitted
1295     * different numbers of vertices, so we may need per-slot offsets.)
1296     *
1297     * Channel masking presents an annoying problem: we may have to replicate
1298     * the data up to 4 times:
1299     *
1300     * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
1301     *
1302     * To avoid penalizing shaders that emit a small number of vertices, we
1303     * can avoid these sometimes: if the size of the control data header is
1304     * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
1305     * land in the same 128-bit group, so we can skip per-slot offsets.
1306     *
1307     * Similarly, if the control data header is <= 32 bits, there is only one
1308     * DWord, so we can skip channel masks.
1309     */
1310    enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
1311
1312    fs_reg channel_mask, per_slot_offset;
1313
1314    if (gs_compile->control_data_header_size_bits > 32) {
1315       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
1316       channel_mask = vgrf(glsl_type::uint_type);
1317    }
1318
1319    if (gs_compile->control_data_header_size_bits > 128) {
1320       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
1321       per_slot_offset = vgrf(glsl_type::uint_type);
1322    }
1323
1324    /* Figure out which DWord we're trying to write to using the formula:
1325     *
1326     *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
1327     *
1328     * Since bits_per_vertex is a power of two, and is known at compile
1329     * time, this can be optimized to:
1330     *
1331     *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
1332     */
1333    if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
1334       fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1335       fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1336       abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu));
1337       unsigned log2_bits_per_vertex =
1338          _mesa_fls(gs_compile->control_data_bits_per_vertex);
1339       abld.SHR(dword_index, prev_count, fs_reg(6u - log2_bits_per_vertex));
1340
1341       if (per_slot_offset.file != BAD_FILE) {
1342          /* Set the per-slot offset to dword_index / 4, so that we'll write to
1343           * the appropriate OWord within the control data header.
1344           */
1345          abld.SHR(per_slot_offset, dword_index, fs_reg(2u));
1346       }
1347
1348       /* Set the channel masks to 1 << (dword_index % 4), so that we'll
1349        * write to the appropriate DWORD within the OWORD.
1350        */
1351       fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1352       fwa_bld.AND(channel, dword_index, fs_reg(3u));
1353       channel_mask = intexp2(fwa_bld, channel);
1354       /* Then the channel masks need to be in bits 23:16. */
1355       fwa_bld.SHL(channel_mask, channel_mask, fs_reg(16u));
1356    }
1357
1358    /* Store the control data bits in the message payload and send it. */
1359    int mlen = 2;
1360    if (channel_mask.file != BAD_FILE)
1361       mlen += 4; /* channel masks, plus 3 extra copies of the data */
1362    if (per_slot_offset.file != BAD_FILE)
1363       mlen++;
1364
1365    fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
1366    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
1367    int i = 0;
1368    sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1369    if (per_slot_offset.file != BAD_FILE)
1370       sources[i++] = per_slot_offset;
1371    if (channel_mask.file != BAD_FILE)
1372       sources[i++] = channel_mask;
1373    while (i < mlen) {
1374       sources[i++] = this->control_data_bits;
1375    }
1376
1377    abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
1378    fs_inst *inst = abld.emit(opcode, reg_undef, payload);
1379    inst->mlen = mlen;
1380    /* We need to increment Global Offset by 256-bits to make room for
1381     * Broadwell's extra "Vertex Count" payload at the beginning of the
1382     * URB entry.  Since this is an OWord message, Global Offset is counted
1383     * in 128-bit units, so we must set it to 2.
1384     */
1385    if (gs_prog_data->static_vertex_count == -1)
1386       inst->offset = 2;
1387 }
1388
1389 void
1390 fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
1391                                             unsigned stream_id)
1392 {
1393    /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
1394
1395    /* Note: we are calling this *before* increasing vertex_count, so
1396     * this->vertex_count == vertex_count - 1 in the formula above.
1397     */
1398
1399    /* Stream mode uses 2 bits per vertex */
1400    assert(gs_compile->control_data_bits_per_vertex == 2);
1401
1402    /* Must be a valid stream */
1403    assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);
1404
1405    /* Control data bits are initialized to 0 so we don't have to set any
1406     * bits when sending vertices to stream 0.
1407     */
1408    if (stream_id == 0)
1409       return;
1410
1411    const fs_builder abld = bld.annotate("set stream control data bits", NULL);
1412
1413    /* reg::sid = stream_id */
1414    fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1415    abld.MOV(sid, fs_reg(stream_id));
1416
1417    /* reg:shift_count = 2 * (vertex_count - 1) */
1418    fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1419    abld.SHL(shift_count, vertex_count, fs_reg(1u));
1420
1421    /* Note: we're relying on the fact that the GEN SHL instruction only pays
1422     * attention to the lower 5 bits of its second source argument, so on this
1423     * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
1424     * stream_id << ((2 * (vertex_count - 1)) % 32).
1425     */
1426    fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1427    abld.SHL(mask, sid, shift_count);
1428    abld.OR(this->control_data_bits, this->control_data_bits, mask);
1429 }
1430
1431 void
1432 fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
1433                            unsigned stream_id)
1434 {
1435    assert(stage == MESA_SHADER_GEOMETRY);
1436
1437    struct brw_gs_prog_data *gs_prog_data =
1438       (struct brw_gs_prog_data *) prog_data;
1439
1440    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1441    vertex_count.type = BRW_REGISTER_TYPE_UD;
1442
1443    /* Haswell and later hardware ignores the "Render Stream Select" bits
1444     * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
1445     * and instead sends all primitives down the pipeline for rasterization.
1446     * If the SOL stage is enabled, "Render Stream Select" is honored and
1447     * primitives bound to non-zero streams are discarded after stream output.
1448     *
1449     * Since the only purpose of primives sent to non-zero streams is to
1450     * be recorded by transform feedback, we can simply discard all geometry
1451     * bound to these streams when transform feedback is disabled.
1452     */
1453    if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
1454       return;
1455
1456    /* If we're outputting 32 control data bits or less, then we can wait
1457     * until the shader is over to output them all.  Otherwise we need to
1458     * output them as we go.  Now is the time to do it, since we're about to
1459     * output the vertex_count'th vertex, so it's guaranteed that the
1460     * control data bits associated with the (vertex_count - 1)th vertex are
1461     * correct.
1462     */
1463    if (gs_compile->control_data_header_size_bits > 32) {
1464       const fs_builder abld =
1465          bld.annotate("emit vertex: emit control data bits");
1466
1467       /* Only emit control data bits if we've finished accumulating a batch
1468        * of 32 bits.  This is the case when:
1469        *
1470        *     (vertex_count * bits_per_vertex) % 32 == 0
1471        *
1472        * (in other words, when the last 5 bits of vertex_count *
1473        * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
1474        * integer n (which is always the case, since bits_per_vertex is
1475        * always 1 or 2), this is equivalent to requiring that the last 5-n
1476        * bits of vertex_count are 0:
1477        *
1478        *     vertex_count & (2^(5-n) - 1) == 0
1479        *
1480        * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
1481        * equivalent to:
1482        *
1483        *     vertex_count & (32 / bits_per_vertex - 1) == 0
1484        *
1485        * TODO: If vertex_count is an immediate, we could do some of this math
1486        *       at compile time...
1487        */
1488       fs_inst *inst =
1489          abld.AND(bld.null_reg_d(), vertex_count,
1490                   fs_reg(32u / gs_compile->control_data_bits_per_vertex - 1u));
1491       inst->conditional_mod = BRW_CONDITIONAL_Z;
1492
1493       abld.IF(BRW_PREDICATE_NORMAL);
1494       /* If vertex_count is 0, then no control data bits have been
1495        * accumulated yet, so we can skip emitting them.
1496        */
1497       abld.CMP(bld.null_reg_d(), vertex_count, fs_reg(0u),
1498                BRW_CONDITIONAL_NEQ);
1499       abld.IF(BRW_PREDICATE_NORMAL);
1500       emit_gs_control_data_bits(vertex_count);
1501       abld.emit(BRW_OPCODE_ENDIF);
1502
1503       /* Reset control_data_bits to 0 so we can start accumulating a new
1504        * batch.
1505        *
1506        * Note: in the case where vertex_count == 0, this neutralizes the
1507        * effect of any call to EndPrimitive() that the shader may have
1508        * made before outputting its first vertex.
1509        */
1510       inst = abld.MOV(this->control_data_bits, fs_reg(0u));
1511       inst->force_writemask_all = true;
1512       abld.emit(BRW_OPCODE_ENDIF);
1513    }
1514
1515    emit_urb_writes(vertex_count);
1516
1517    /* In stream mode we have to set control data bits for all vertices
1518     * unless we have disabled control data bits completely (which we do
1519     * do for GL_POINTS outputs that don't use streams).
1520     */
1521    if (gs_compile->control_data_header_size_bits > 0 &&
1522        gs_prog_data->control_data_format ==
1523           GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
1524       set_gs_stream_control_data_bits(vertex_count, stream_id);
1525    }
1526 }
1527
1528 void
1529 fs_visitor::emit_gs_input_load(const fs_reg &dst,
1530                                const nir_src &vertex_src,
1531                                unsigned input_offset,
1532                                unsigned num_components)
1533 {
1534    const brw_vue_prog_data *vue_prog_data = (const brw_vue_prog_data *) prog_data;
1535    const unsigned vertex = nir_src_as_const_value(vertex_src)->u[0];
1536
1537    const unsigned array_stride = vue_prog_data->urb_read_length * 8;
1538
1539    const bool pushed = 4 * input_offset < array_stride;
1540
1541    if (input_offset == 0) {
1542       /* This is the VUE header, containing VARYING_SLOT_LAYER [.y],
1543        * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].
1544        * Only gl_PointSize is available as a GS input, so they must
1545        * be asking for that input.
1546        */
1547       if (pushed) {
1548          bld.MOV(dst, fs_reg(ATTR, array_stride * vertex + 3, dst.type));
1549       } else {
1550          fs_reg tmp = bld.vgrf(dst.type, 4);
1551          fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
1552                                   fs_reg(vertex), fs_reg(0));
1553          inst->regs_written = 4;
1554          bld.MOV(dst, offset(tmp, bld, 3));
1555       }
1556    } else {
1557       if (pushed) {
1558          int index = vertex * array_stride + 4 * input_offset;
1559          for (unsigned i = 0; i < num_components; i++) {
1560             bld.MOV(offset(dst, bld, i), fs_reg(ATTR, index + i, dst.type));
1561          }
1562       } else {
1563          fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
1564                                   fs_reg(vertex), fs_reg(input_offset));
1565          inst->regs_written = num_components;
1566       }
1567    }
1568 }
1569
1570 void
1571 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
1572 {
1573    fs_reg dest;
1574    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
1575       dest = get_nir_dest(instr->dest);
1576
1577    bool has_indirect = false;
1578
1579    switch (instr->intrinsic) {
1580    case nir_intrinsic_discard:
1581    case nir_intrinsic_discard_if: {
1582       /* We track our discarded pixels in f0.1.  By predicating on it, we can
1583        * update just the flag bits that aren't yet discarded.  If there's no
1584        * condition, we emit a CMP of g0 != g0, so all currently executing
1585        * channels will get turned off.
1586        */
1587       fs_inst *cmp;
1588       if (instr->intrinsic == nir_intrinsic_discard_if) {
1589          cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
1590                        fs_reg(0), BRW_CONDITIONAL_Z);
1591       } else {
1592          fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
1593                                        BRW_REGISTER_TYPE_UW));
1594          cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
1595       }
1596       cmp->predicate = BRW_PREDICATE_NORMAL;
1597       cmp->flag_subreg = 1;
1598
1599       if (devinfo->gen >= 6) {
1600          emit_discard_jump();
1601       }
1602       break;
1603    }
1604
1605    case nir_intrinsic_atomic_counter_inc:
1606    case nir_intrinsic_atomic_counter_dec:
1607    case nir_intrinsic_atomic_counter_read: {
1608       using namespace surface_access;
1609
1610       /* Get the arguments of the atomic intrinsic. */
1611       const fs_reg offset = get_nir_src(instr->src[0]);
1612       const unsigned surface = (stage_prog_data->binding_table.abo_start +
1613                                 instr->const_index[0]);
1614       fs_reg tmp;
1615
1616       /* Emit a surface read or atomic op. */
1617       switch (instr->intrinsic) {
1618       case nir_intrinsic_atomic_counter_read:
1619          tmp = emit_untyped_read(bld, fs_reg(surface), offset, 1, 1);
1620          break;
1621
1622       case nir_intrinsic_atomic_counter_inc:
1623          tmp = emit_untyped_atomic(bld, fs_reg(surface), offset, fs_reg(),
1624                                    fs_reg(), 1, 1, BRW_AOP_INC);
1625          break;
1626
1627       case nir_intrinsic_atomic_counter_dec:
1628          tmp = emit_untyped_atomic(bld, fs_reg(surface), offset, fs_reg(),
1629                                    fs_reg(), 1, 1, BRW_AOP_PREDEC);
1630          break;
1631
1632       default:
1633          unreachable("Unreachable");
1634       }
1635
1636       /* Assign the result. */
1637       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp);
1638
1639       /* Mark the surface as used. */
1640       brw_mark_surface_used(stage_prog_data, surface);
1641       break;
1642    }
1643
1644    case nir_intrinsic_image_load:
1645    case nir_intrinsic_image_store:
1646    case nir_intrinsic_image_atomic_add:
1647    case nir_intrinsic_image_atomic_min:
1648    case nir_intrinsic_image_atomic_max:
1649    case nir_intrinsic_image_atomic_and:
1650    case nir_intrinsic_image_atomic_or:
1651    case nir_intrinsic_image_atomic_xor:
1652    case nir_intrinsic_image_atomic_exchange:
1653    case nir_intrinsic_image_atomic_comp_swap: {
1654       using namespace image_access;
1655
1656       /* Get the referenced image variable and type. */
1657       const nir_variable *var = instr->variables[0]->var;
1658       const glsl_type *type = var->type->without_array();
1659       const brw_reg_type base_type = get_image_base_type(type);
1660
1661       /* Get some metadata from the image intrinsic. */
1662       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
1663       const unsigned arr_dims = type->sampler_array ? 1 : 0;
1664       const unsigned surf_dims = type->coordinate_components() - arr_dims;
1665       const mesa_format format =
1666          (var->data.image.write_only ? MESA_FORMAT_NONE :
1667           _mesa_get_shader_image_format(var->data.image.format));
1668
1669       /* Get the arguments of the image intrinsic. */
1670       const fs_reg image = get_nir_image_deref(instr->variables[0]);
1671       const fs_reg addr = retype(get_nir_src(instr->src[0]),
1672                                  BRW_REGISTER_TYPE_UD);
1673       const fs_reg src0 = (info->num_srcs >= 3 ?
1674                            retype(get_nir_src(instr->src[2]), base_type) :
1675                            fs_reg());
1676       const fs_reg src1 = (info->num_srcs >= 4 ?
1677                            retype(get_nir_src(instr->src[3]), base_type) :
1678                            fs_reg());
1679       fs_reg tmp;
1680
1681       /* Emit an image load, store or atomic op. */
1682       if (instr->intrinsic == nir_intrinsic_image_load)
1683          tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
1684
1685       else if (instr->intrinsic == nir_intrinsic_image_store)
1686          emit_image_store(bld, image, addr, src0, surf_dims, arr_dims, format);
1687
1688       else
1689          tmp = emit_image_atomic(bld, image, addr, src0, src1,
1690                                  surf_dims, arr_dims, info->dest_components,
1691                                  get_image_atomic_op(instr->intrinsic, type));
1692
1693       /* Assign the result. */
1694       for (unsigned c = 0; c < info->dest_components; ++c)
1695          bld.MOV(offset(retype(dest, base_type), bld, c),
1696                  offset(tmp, bld, c));
1697       break;
1698    }
1699
1700    case nir_intrinsic_memory_barrier: {
1701       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 16 / dispatch_width);
1702       bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
1703          ->regs_written = 2;
1704       break;
1705    }
1706
1707    case nir_intrinsic_shader_clock: {
1708       /* We cannot do anything if there is an event, so ignore it for now */
1709       fs_reg shader_clock = get_timestamp(bld);
1710       const fs_reg srcs[] = { shader_clock.set_smear(0), shader_clock.set_smear(1) };
1711
1712       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
1713       break;
1714    }
1715
1716    case nir_intrinsic_image_size: {
1717       /* Get the referenced image variable and type. */
1718       const nir_variable *var = instr->variables[0]->var;
1719       const glsl_type *type = var->type->without_array();
1720
1721       /* Get the size of the image. */
1722       const fs_reg image = get_nir_image_deref(instr->variables[0]);
1723       const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
1724
1725       /* For 1DArray image types, the array index is stored in the Z component.
1726        * Fix this by swizzling the Z component to the Y component.
1727        */
1728       const bool is_1d_array_image =
1729                   type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
1730                   type->sampler_array;
1731
1732       /* For CubeArray images, we should count the number of cubes instead
1733        * of the number of faces. Fix it by dividing the (Z component) by 6.
1734        */
1735       const bool is_cube_array_image =
1736                   type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
1737                   type->sampler_array;
1738
1739       /* Copy all the components. */
1740       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
1741       for (unsigned c = 0; c < info->dest_components; ++c) {
1742          if ((int)c >= type->coordinate_components()) {
1743              bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
1744                      fs_reg(1));
1745          } else if (c == 1 && is_1d_array_image) {
1746             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
1747                     offset(size, bld, 2));
1748          } else if (c == 2 && is_cube_array_image) {
1749             bld.emit(SHADER_OPCODE_INT_QUOTIENT,
1750                      offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
1751                      offset(size, bld, c), fs_reg(6));
1752          } else {
1753             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
1754                     offset(size, bld, c));
1755          }
1756        }
1757
1758       break;
1759    }
1760
1761    case nir_intrinsic_image_samples:
1762       /* The driver does not support multi-sampled images. */
1763       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), fs_reg(1));
1764       break;
1765
1766    case nir_intrinsic_load_front_face:
1767       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
1768               *emit_frontfacing_interpolation());
1769       break;
1770
1771    case nir_intrinsic_load_vertex_id:
1772       unreachable("should be lowered by lower_vertex_id()");
1773
1774    case nir_intrinsic_load_primitive_id:
1775       assert(stage == MESA_SHADER_GEOMETRY);
1776       assert(((struct brw_gs_prog_data *)prog_data)->include_primitive_id);
1777       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
1778               retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
1779       break;
1780
1781    case nir_intrinsic_load_vertex_id_zero_base:
1782    case nir_intrinsic_load_base_vertex:
1783    case nir_intrinsic_load_instance_id:
1784    case nir_intrinsic_load_invocation_id:
1785    case nir_intrinsic_load_sample_mask_in:
1786    case nir_intrinsic_load_sample_id: {
1787       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
1788       fs_reg val = nir_system_values[sv];
1789       assert(val.file != BAD_FILE);
1790       dest.type = val.type;
1791       bld.MOV(dest, val);
1792       break;
1793    }
1794
1795    case nir_intrinsic_load_sample_pos: {
1796       fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
1797       assert(sample_pos.file != BAD_FILE);
1798       dest.type = sample_pos.type;
1799       bld.MOV(dest, sample_pos);
1800       bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
1801       break;
1802    }
1803
1804    case nir_intrinsic_load_uniform_indirect:
1805       has_indirect = true;
1806       /* fallthrough */
1807    case nir_intrinsic_load_uniform: {
1808       fs_reg uniform_reg(UNIFORM, instr->const_index[0]);
1809       uniform_reg.reg_offset = instr->const_index[1];
1810
1811       for (unsigned j = 0; j < instr->num_components; j++) {
1812          fs_reg src = offset(retype(uniform_reg, dest.type), bld, j);
1813          if (has_indirect)
1814             src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
1815
1816          bld.MOV(dest, src);
1817          dest = offset(dest, bld, 1);
1818       }
1819       break;
1820    }
1821
1822    case nir_intrinsic_load_ubo_indirect:
1823       has_indirect = true;
1824       /* fallthrough */
1825    case nir_intrinsic_load_ubo: {
1826       nir_const_value *const_index = nir_src_as_const_value(instr->src[0]);
1827       fs_reg surf_index;
1828
1829       if (const_index) {
1830          surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
1831                              const_index->u[0]);
1832       } else {
1833          /* The block index is not a constant. Evaluate the index expression
1834           * per-channel and add the base UBO index; we have to select a value
1835           * from any live channel.
1836           */
1837          surf_index = vgrf(glsl_type::uint_type);
1838          bld.ADD(surf_index, get_nir_src(instr->src[0]),
1839                  fs_reg(stage_prog_data->binding_table.ubo_start));
1840          surf_index = bld.emit_uniformize(surf_index);
1841
1842          /* Assume this may touch any UBO. It would be nice to provide
1843           * a tighter bound, but the array information is already lowered away.
1844           */
1845          brw_mark_surface_used(prog_data,
1846                                stage_prog_data->binding_table.ubo_start +
1847                                nir->info.num_ubos - 1);
1848       }
1849
1850       if (has_indirect) {
1851          /* Turn the byte offset into a dword offset. */
1852          fs_reg base_offset = vgrf(glsl_type::int_type);
1853          bld.SHR(base_offset, retype(get_nir_src(instr->src[1]),
1854                                      BRW_REGISTER_TYPE_D),
1855                  fs_reg(2));
1856
1857          unsigned vec4_offset = instr->const_index[0] / 4;
1858          for (int i = 0; i < instr->num_components; i++)
1859             VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
1860                                        base_offset, vec4_offset + i);
1861       } else {
1862          fs_reg packed_consts = vgrf(glsl_type::float_type);
1863          packed_consts.type = dest.type;
1864
1865          fs_reg const_offset_reg((unsigned) instr->const_index[0] & ~15);
1866          bld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
1867                   surf_index, const_offset_reg);
1868
1869          for (unsigned i = 0; i < instr->num_components; i++) {
1870             packed_consts.set_smear(instr->const_index[0] % 16 / 4 + i);
1871
1872             /* The std140 packing rules don't allow vectors to cross 16-byte
1873              * boundaries, and a reg is 32 bytes.
1874              */
1875             assert(packed_consts.subreg_offset < 32);
1876
1877             bld.MOV(dest, packed_consts);
1878             dest = offset(dest, bld, 1);
1879          }
1880       }
1881       break;
1882    }
1883
1884    case nir_intrinsic_load_ssbo_indirect:
1885       has_indirect = true;
1886       /* fallthrough */
1887    case nir_intrinsic_load_ssbo: {
1888       assert(devinfo->gen >= 7);
1889
1890       nir_const_value *const_uniform_block =
1891          nir_src_as_const_value(instr->src[0]);
1892
1893       fs_reg surf_index;
1894       if (const_uniform_block) {
1895          unsigned index = stage_prog_data->binding_table.ssbo_start +
1896                           const_uniform_block->u[0];
1897          surf_index = fs_reg(index);
1898          brw_mark_surface_used(prog_data, index);
1899       } else {
1900          surf_index = vgrf(glsl_type::uint_type);
1901          bld.ADD(surf_index, get_nir_src(instr->src[0]),
1902                  fs_reg(stage_prog_data->binding_table.ssbo_start));
1903
1904          /* Assume this may touch any SSBO. It would be nice to provide
1905           * a tighter bound, but the array information is already lowered away.
1906           */
1907          brw_mark_surface_used(prog_data,
1908                                stage_prog_data->binding_table.ssbo_start +
1909                                nir->info.num_ssbos - 1);
1910       }
1911
1912       /* Get the offset to read from */
1913       fs_reg offset_reg;
1914       if (has_indirect) {
1915          offset_reg = get_nir_src(instr->src[1]);
1916       } else {
1917          offset_reg = fs_reg(instr->const_index[0]);
1918       }
1919
1920       /* Read the vector */
1921       fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
1922                                              1 /* dims */,
1923                                              instr->num_components,
1924                                              BRW_PREDICATE_NONE);
1925       read_result.type = dest.type;
1926       for (int i = 0; i < instr->num_components; i++)
1927          bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
1928
1929       break;
1930    }
1931
1932    case nir_intrinsic_load_input_indirect:
1933       has_indirect = true;
1934       /* fallthrough */
1935    case nir_intrinsic_load_input: {
1936       unsigned index = 0;
1937       for (unsigned j = 0; j < instr->num_components; j++) {
1938          fs_reg src;
1939          if (stage == MESA_SHADER_VERTEX) {
1940             src = offset(fs_reg(ATTR, instr->const_index[0], dest.type), bld, index);
1941          } else {
1942             src = offset(retype(nir_inputs, dest.type), bld,
1943                          instr->const_index[0] + index);
1944          }
1945          if (has_indirect)
1946             src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
1947          index++;
1948
1949          bld.MOV(dest, src);
1950          dest = offset(dest, bld, 1);
1951       }
1952       break;
1953    }
1954
1955    case nir_intrinsic_load_per_vertex_input_indirect:
1956       assert(!"Not allowed");
1957       /* fallthrough */
1958    case nir_intrinsic_load_per_vertex_input:
1959       emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
1960                          instr->num_components);
1961       break;
1962
1963    /* Handle ARB_gpu_shader5 interpolation intrinsics
1964     *
1965     * It's worth a quick word of explanation as to why we handle the full
1966     * variable-based interpolation intrinsic rather than a lowered version
1967     * like we do for other inputs.  We have to do that because the way
1968     * we set up inputs doesn't allow us to use the already setup inputs for
1969     * interpolation.  At the beginning of the shader, we go through all of
1970     * the input variables and do the initial interpolation and put it in
1971     * the nir_inputs array based on its location as determined in
1972     * nir_lower_io.  If the input isn't used, dead code cleans up and
1973     * everything works fine.  However, when we get to the ARB_gpu_shader5
1974     * interpolation intrinsics, we need to reinterpolate the input
1975     * differently.  If we used an intrinsic that just had an index it would
1976     * only give us the offset into the nir_inputs array.  However, this is
1977     * useless because that value is post-interpolation and we need
1978     * pre-interpolation.  In order to get the actual location of the bits
1979     * we get from the vertex fetching hardware, we need the variable.
1980     */
1981    case nir_intrinsic_interp_var_at_centroid:
1982    case nir_intrinsic_interp_var_at_sample:
1983    case nir_intrinsic_interp_var_at_offset: {
1984       assert(stage == MESA_SHADER_FRAGMENT);
1985
1986       ((struct brw_wm_prog_data *) prog_data)->pulls_bary = true;
1987
1988       fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
1989       const glsl_interp_qualifier interpolation =
1990          (glsl_interp_qualifier) instr->variables[0]->var->data.interpolation;
1991
1992       switch (instr->intrinsic) {
1993       case nir_intrinsic_interp_var_at_centroid:
1994          emit_pixel_interpolater_send(bld,
1995                                       FS_OPCODE_INTERPOLATE_AT_CENTROID,
1996                                       dst_xy,
1997                                       fs_reg(), /* src */
1998                                       fs_reg(0u),
1999                                       interpolation);
2000          break;
2001
2002       case nir_intrinsic_interp_var_at_sample: {
2003          nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
2004
2005          if (const_sample) {
2006             unsigned msg_data = const_sample->i[0] << 4;
2007
2008             emit_pixel_interpolater_send(bld,
2009                                          FS_OPCODE_INTERPOLATE_AT_SAMPLE,
2010                                          dst_xy,
2011                                          fs_reg(), /* src */
2012                                          fs_reg(msg_data),
2013                                          interpolation);
2014          } else {
2015             const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
2016                                              BRW_REGISTER_TYPE_UD);
2017
2018             if (nir_src_is_dynamically_uniform(instr->src[0])) {
2019                const fs_reg sample_id = bld.emit_uniformize(sample_src);
2020                const fs_reg msg_data = vgrf(glsl_type::uint_type);
2021                bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u));
2022                emit_pixel_interpolater_send(bld,
2023                                             FS_OPCODE_INTERPOLATE_AT_SAMPLE,
2024                                             dst_xy,
2025                                             fs_reg(), /* src */
2026                                             msg_data,
2027                                             interpolation);
2028             } else {
2029                /* Make a loop that sends a message to the pixel interpolater
2030                 * for the sample number in each live channel. If there are
2031                 * multiple channels with the same sample number then these
2032                 * will be handled simultaneously with a single iteration of
2033                 * the loop.
2034                 */
2035                bld.emit(BRW_OPCODE_DO);
2036
2037                /* Get the next live sample number into sample_id_reg */
2038                const fs_reg sample_id = bld.emit_uniformize(sample_src);
2039
2040                /* Set the flag register so that we can perform the send
2041                 * message on all channels that have the same sample number
2042                 */
2043                bld.CMP(bld.null_reg_ud(),
2044                        sample_src, sample_id,
2045                        BRW_CONDITIONAL_EQ);
2046                const fs_reg msg_data = vgrf(glsl_type::uint_type);
2047                bld.exec_all().group(1, 0).SHL(msg_data, sample_id, fs_reg(4u));
2048                fs_inst *inst =
2049                   emit_pixel_interpolater_send(bld,
2050                                                FS_OPCODE_INTERPOLATE_AT_SAMPLE,
2051                                                dst_xy,
2052                                                fs_reg(), /* src */
2053                                                msg_data,
2054                                                interpolation);
2055                set_predicate(BRW_PREDICATE_NORMAL, inst);
2056
2057                /* Continue the loop if there are any live channels left */
2058                set_predicate_inv(BRW_PREDICATE_NORMAL,
2059                                  true, /* inverse */
2060                                  bld.emit(BRW_OPCODE_WHILE));
2061             }
2062          }
2063
2064          break;
2065       }
2066
2067       case nir_intrinsic_interp_var_at_offset: {
2068          nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
2069
2070          if (const_offset) {
2071             unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
2072             unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;
2073
2074             emit_pixel_interpolater_send(bld,
2075                                          FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
2076                                          dst_xy,
2077                                          fs_reg(), /* src */
2078                                          fs_reg(off_x | (off_y << 4)),
2079                                          interpolation);
2080          } else {
2081             fs_reg src = vgrf(glsl_type::ivec2_type);
2082             fs_reg offset_src = retype(get_nir_src(instr->src[0]),
2083                                        BRW_REGISTER_TYPE_F);
2084             for (int i = 0; i < 2; i++) {
2085                fs_reg temp = vgrf(glsl_type::float_type);
2086                bld.MUL(temp, offset(offset_src, bld, i), fs_reg(16.0f));
2087                fs_reg itemp = vgrf(glsl_type::int_type);
2088                bld.MOV(itemp, temp);  /* float to int */
2089
2090                /* Clamp the upper end of the range to +7/16.
2091                 * ARB_gpu_shader5 requires that we support a maximum offset
2092                 * of +0.5, which isn't representable in a S0.4 value -- if
2093                 * we didn't clamp it, we'd end up with -8/16, which is the
2094                 * opposite of what the shader author wanted.
2095                 *
2096                 * This is legal due to ARB_gpu_shader5's quantization
2097                 * rules:
2098                 *
2099                 * "Not all values of <offset> may be supported; x and y
2100                 * offsets may be rounded to fixed-point values with the
2101                 * number of fraction bits given by the
2102                 * implementation-dependent constant
2103                 * FRAGMENT_INTERPOLATION_OFFSET_BITS"
2104                 */
2105                set_condmod(BRW_CONDITIONAL_L,
2106                            bld.SEL(offset(src, bld, i), itemp, fs_reg(7)));
2107             }
2108
2109             const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
2110             emit_pixel_interpolater_send(bld,
2111                                          opcode,
2112                                          dst_xy,
2113                                          src,
2114                                          fs_reg(0u),
2115                                          interpolation);
2116          }
2117          break;
2118       }
2119
2120       default:
2121          unreachable("Invalid intrinsic");
2122       }
2123
2124       for (unsigned j = 0; j < instr->num_components; j++) {
2125          fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
2126          src.type = dest.type;
2127
2128          bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
2129          dest = offset(dest, bld, 1);
2130       }
2131       break;
2132    }
2133
2134    case nir_intrinsic_store_ssbo_indirect:
2135       has_indirect = true;
2136       /* fallthrough */
2137    case nir_intrinsic_store_ssbo: {
2138       assert(devinfo->gen >= 7);
2139
2140       /* Block index */
2141       fs_reg surf_index;
2142       nir_const_value *const_uniform_block =
2143          nir_src_as_const_value(instr->src[1]);
2144       if (const_uniform_block) {
2145          unsigned index = stage_prog_data->binding_table.ssbo_start +
2146                           const_uniform_block->u[0];
2147          surf_index = fs_reg(index);
2148          brw_mark_surface_used(prog_data, index);
2149       } else {
2150          surf_index = vgrf(glsl_type::uint_type);
2151          bld.ADD(surf_index, get_nir_src(instr->src[1]),
2152                   fs_reg(stage_prog_data->binding_table.ssbo_start));
2153
2154          brw_mark_surface_used(prog_data,
2155                                stage_prog_data->binding_table.ssbo_start +
2156                                nir->info.num_ssbos - 1);
2157       }
2158
2159       /* Value */
2160       fs_reg val_reg = get_nir_src(instr->src[0]);
2161
2162       /* Writemask */
2163       unsigned writemask = instr->const_index[1];
2164
2165       /* Combine groups of consecutive enabled channels in one write
2166        * message. We use ffs to find the first enabled channel and then ffs on
2167        * the bit-inverse, down-shifted writemask to determine the length of
2168        * the block of enabled bits.
2169        */
2170       while (writemask) {
2171          unsigned first_component = ffs(writemask) - 1;
2172          unsigned length = ffs(~(writemask >> first_component)) - 1;
2173          fs_reg offset_reg;
2174
2175          if (!has_indirect) {
2176             offset_reg = fs_reg(instr->const_index[0] + 4 * first_component);
2177          } else {
2178             offset_reg = vgrf(glsl_type::uint_type);
2179             bld.ADD(offset_reg,
2180                     retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD),
2181                     fs_reg(4 * first_component));
2182          }
2183
2184          emit_untyped_write(bld, surf_index, offset_reg,
2185                             offset(val_reg, bld, first_component),
2186                             1 /* dims */, length,
2187                             BRW_PREDICATE_NONE);
2188
2189          /* Clear the bits in the writemask that we just wrote, then try
2190           * again to see if more channels are left.
2191           */
2192          writemask &= (15 << (first_component + length));
2193       }
2194       break;
2195    }
2196
2197    case nir_intrinsic_store_output_indirect:
2198       has_indirect = true;
2199       /* fallthrough */
2200    case nir_intrinsic_store_output: {
2201       fs_reg src = get_nir_src(instr->src[0]);
2202       unsigned index = 0;
2203       for (unsigned j = 0; j < instr->num_components; j++) {
2204          fs_reg new_dest = offset(retype(nir_outputs, src.type), bld,
2205                                   instr->const_index[0] + index);
2206          if (has_indirect)
2207             src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[1]));
2208          index++;
2209          bld.MOV(new_dest, src);
2210          src = offset(src, bld, 1);
2211       }
2212       break;
2213    }
2214
2215    case nir_intrinsic_barrier:
2216       emit_barrier();
2217       if (stage == MESA_SHADER_COMPUTE)
2218          ((struct brw_cs_prog_data *) prog_data)->uses_barrier = true;
2219       break;
2220
2221    case nir_intrinsic_load_local_invocation_id:
2222    case nir_intrinsic_load_work_group_id: {
2223       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
2224       fs_reg val = nir_system_values[sv];
2225       assert(val.file != BAD_FILE);
2226       dest.type = val.type;
2227       for (unsigned i = 0; i < 3; i++)
2228          bld.MOV(offset(dest, bld, i), offset(val, bld, i));
2229       break;
2230    }
2231
2232    case nir_intrinsic_ssbo_atomic_add:
2233       nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr);
2234       break;
2235    case nir_intrinsic_ssbo_atomic_imin:
2236       nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
2237       break;
2238    case nir_intrinsic_ssbo_atomic_umin:
2239       nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
2240       break;
2241    case nir_intrinsic_ssbo_atomic_imax:
2242       nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
2243       break;
2244    case nir_intrinsic_ssbo_atomic_umax:
2245       nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
2246       break;
2247    case nir_intrinsic_ssbo_atomic_and:
2248       nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
2249       break;
2250    case nir_intrinsic_ssbo_atomic_or:
2251       nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
2252       break;
2253    case nir_intrinsic_ssbo_atomic_xor:
2254       nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
2255       break;
2256    case nir_intrinsic_ssbo_atomic_exchange:
2257       nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
2258       break;
2259    case nir_intrinsic_ssbo_atomic_comp_swap:
2260       nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
2261       break;
2262
2263    case nir_intrinsic_get_buffer_size: {
2264       nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
2265       unsigned ssbo_index = const_uniform_block ? const_uniform_block->u[0] : 0;
2266       int reg_width = dispatch_width / 8;
2267
2268       /* Set LOD = 0 */
2269       fs_reg source = fs_reg(0);
2270
2271       int mlen = 1 * reg_width;
2272       fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
2273                                   BRW_REGISTER_TYPE_UD);
2274       bld.LOAD_PAYLOAD(src_payload, &source, 1, 0);
2275
2276       fs_reg surf_index = fs_reg(prog_data->binding_table.ssbo_start + ssbo_index);
2277       fs_inst *inst = bld.emit(FS_OPCODE_GET_BUFFER_SIZE, dest,
2278                                src_payload, surf_index);
2279       inst->header_size = 0;
2280       inst->mlen = mlen;
2281       bld.emit(inst);
2282       break;
2283    }
2284
2285    case nir_intrinsic_load_num_work_groups: {
2286       assert(devinfo->gen >= 7);
2287       assert(stage == MESA_SHADER_COMPUTE);
2288
2289       struct brw_cs_prog_data *cs_prog_data =
2290          (struct brw_cs_prog_data *) prog_data;
2291       const unsigned surface =
2292          cs_prog_data->binding_table.work_groups_start;
2293
2294       cs_prog_data->uses_num_work_groups = true;
2295
2296       fs_reg surf_index = fs_reg(surface);
2297       brw_mark_surface_used(prog_data, surface);
2298
2299       /* Read the 3 GLuint components of gl_NumWorkGroups */
2300       for (unsigned i = 0; i < 3; i++) {
2301          fs_reg read_result =
2302             emit_untyped_read(bld, surf_index,
2303                               fs_reg(i << 2),
2304                               1 /* dims */, 1 /* size */,
2305                               BRW_PREDICATE_NONE);
2306          read_result.type = dest.type;
2307          bld.MOV(dest, read_result);
2308          dest = offset(dest, bld, 1);
2309       }
2310       break;
2311    }
2312
2313    case nir_intrinsic_emit_vertex_with_counter:
2314       emit_gs_vertex(instr->src[0], instr->const_index[0]);
2315       break;
2316
2317    case nir_intrinsic_end_primitive_with_counter:
2318       emit_gs_end_primitive(instr->src[0]);
2319       break;
2320
2321    case nir_intrinsic_set_vertex_count:
2322       bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
2323       break;
2324
2325    default:
2326       unreachable("unknown intrinsic");
2327    }
2328 }
2329
2330 void
2331 fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
2332                                  int op, nir_intrinsic_instr *instr)
2333 {
2334    fs_reg dest;
2335    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2336       dest = get_nir_dest(instr->dest);
2337
2338    fs_reg surface;
2339    nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
2340    if (const_surface) {
2341       unsigned surf_index = stage_prog_data->binding_table.ssbo_start +
2342                             const_surface->u[0];
2343       surface = fs_reg(surf_index);
2344       brw_mark_surface_used(prog_data, surf_index);
2345    } else {
2346       surface = vgrf(glsl_type::uint_type);
2347       bld.ADD(surface, get_nir_src(instr->src[0]),
2348               fs_reg(stage_prog_data->binding_table.ssbo_start));
2349
2350       /* Assume this may touch any SSBO. This is the same we do for other
2351        * UBO/SSBO accesses with non-constant surface.
2352        */
2353       brw_mark_surface_used(prog_data,
2354                             stage_prog_data->binding_table.ssbo_start +
2355                             nir->info.num_ssbos - 1);
2356    }
2357
2358    fs_reg offset = get_nir_src(instr->src[1]);
2359    fs_reg data1 = get_nir_src(instr->src[2]);
2360    fs_reg data2;
2361    if (op == BRW_AOP_CMPWR)
2362       data2 = get_nir_src(instr->src[3]);
2363
2364    /* Emit the actual atomic operation operation */
2365
2366    fs_reg atomic_result =
2367       surface_access::emit_untyped_atomic(bld, surface, offset,
2368                                           data1, data2,
2369                                           1 /* dims */, 1 /* rsize */,
2370                                           op,
2371                                           BRW_PREDICATE_NONE);
2372    dest.type = atomic_result.type;
2373    bld.MOV(dest, atomic_result);
2374 }
2375
2376 void
2377 fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
2378 {
2379    unsigned sampler = instr->sampler_index;
2380    fs_reg sampler_reg(sampler);
2381
2382    int gather_component = instr->component;
2383
2384    bool is_rect = instr->sampler_dim == GLSL_SAMPLER_DIM_RECT;
2385
2386    bool is_cube_array = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
2387                         instr->is_array;
2388
2389    int lod_components = 0;
2390    int UNUSED offset_components = 0;
2391
2392    fs_reg coordinate, shadow_comparitor, lod, lod2, sample_index, mcs, tex_offset;
2393
2394    for (unsigned i = 0; i < instr->num_srcs; i++) {
2395       fs_reg src = get_nir_src(instr->src[i].src);
2396       switch (instr->src[i].src_type) {
2397       case nir_tex_src_bias:
2398          lod = retype(src, BRW_REGISTER_TYPE_F);
2399          break;
2400       case nir_tex_src_comparitor:
2401          shadow_comparitor = retype(src, BRW_REGISTER_TYPE_F);
2402          break;
2403       case nir_tex_src_coord:
2404          switch (instr->op) {
2405          case nir_texop_txf:
2406          case nir_texop_txf_ms:
2407             coordinate = retype(src, BRW_REGISTER_TYPE_D);
2408             break;
2409          default:
2410             coordinate = retype(src, BRW_REGISTER_TYPE_F);
2411             break;
2412          }
2413          break;
2414       case nir_tex_src_ddx:
2415          lod = retype(src, BRW_REGISTER_TYPE_F);
2416          lod_components = nir_tex_instr_src_size(instr, i);
2417          break;
2418       case nir_tex_src_ddy:
2419          lod2 = retype(src, BRW_REGISTER_TYPE_F);
2420          break;
2421       case nir_tex_src_lod:
2422          switch (instr->op) {
2423          case nir_texop_txs:
2424             lod = retype(src, BRW_REGISTER_TYPE_UD);
2425             break;
2426          case nir_texop_txf:
2427             lod = retype(src, BRW_REGISTER_TYPE_D);
2428             break;
2429          default:
2430             lod = retype(src, BRW_REGISTER_TYPE_F);
2431             break;
2432          }
2433          break;
2434       case nir_tex_src_ms_index:
2435          sample_index = retype(src, BRW_REGISTER_TYPE_UD);
2436          break;
2437       case nir_tex_src_offset:
2438          tex_offset = retype(src, BRW_REGISTER_TYPE_D);
2439          if (instr->is_array)
2440             offset_components = instr->coord_components - 1;
2441          else
2442             offset_components = instr->coord_components;
2443          break;
2444       case nir_tex_src_projector:
2445          unreachable("should be lowered");
2446
2447       case nir_tex_src_sampler_offset: {
2448          /* Figure out the highest possible sampler index and mark it as used */
2449          uint32_t max_used = sampler + instr->sampler_array_size - 1;
2450          if (instr->op == nir_texop_tg4 && devinfo->gen < 8) {
2451             max_used += stage_prog_data->binding_table.gather_texture_start;
2452          } else {
2453             max_used += stage_prog_data->binding_table.texture_start;
2454          }
2455          brw_mark_surface_used(prog_data, max_used);
2456
2457          /* Emit code to evaluate the actual indexing expression */
2458          sampler_reg = vgrf(glsl_type::uint_type);
2459          bld.ADD(sampler_reg, src, fs_reg(sampler));
2460          sampler_reg = bld.emit_uniformize(sampler_reg);
2461          break;
2462       }
2463
2464       default:
2465          unreachable("unknown texture source");
2466       }
2467    }
2468
2469    if (instr->op == nir_texop_txf_ms) {
2470       if (devinfo->gen >= 7 &&
2471           key_tex->compressed_multisample_layout_mask & (1 << sampler)) {
2472          mcs = emit_mcs_fetch(coordinate, instr->coord_components, sampler_reg);
2473       } else {
2474          mcs = fs_reg(0u);
2475       }
2476    }
2477
2478    for (unsigned i = 0; i < 3; i++) {
2479       if (instr->const_offset[i] != 0) {
2480          assert(offset_components == 0);
2481          tex_offset = fs_reg(brw_texture_offset(instr->const_offset, 3));
2482          break;
2483       }
2484    }
2485
2486    enum glsl_base_type dest_base_type =
2487      brw_glsl_base_type_for_nir_type (instr->dest_type);
2488
2489    const glsl_type *dest_type =
2490       glsl_type::get_instance(dest_base_type, nir_tex_instr_dest_size(instr),
2491                               1);
2492
2493    ir_texture_opcode op;
2494    switch (instr->op) {
2495    case nir_texop_lod: op = ir_lod; break;
2496    case nir_texop_query_levels: op = ir_query_levels; break;
2497    case nir_texop_tex: op = ir_tex; break;
2498    case nir_texop_tg4: op = ir_tg4; break;
2499    case nir_texop_txb: op = ir_txb; break;
2500    case nir_texop_txd: op = ir_txd; break;
2501    case nir_texop_txf: op = ir_txf; break;
2502    case nir_texop_txf_ms: op = ir_txf_ms; break;
2503    case nir_texop_txl: op = ir_txl; break;
2504    case nir_texop_txs: op = ir_txs; break;
2505    case nir_texop_texture_samples: {
2506       fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
2507       fs_inst *inst = bld.emit(SHADER_OPCODE_SAMPLEINFO, dst,
2508                                bld.vgrf(BRW_REGISTER_TYPE_D, 1),
2509                                sampler_reg);
2510       inst->mlen = 1;
2511       inst->header_size = 1;
2512       inst->base_mrf = -1;
2513       return;
2514    }
2515    default:
2516       unreachable("unknown texture opcode");
2517    }
2518
2519    emit_texture(op, dest_type, coordinate, instr->coord_components,
2520                 shadow_comparitor, lod, lod2, lod_components, sample_index,
2521                 tex_offset, mcs, gather_component,
2522                 is_cube_array, is_rect, sampler, sampler_reg);
2523
2524    fs_reg dest = get_nir_dest(instr->dest);
2525    dest.type = this->result.type;
2526    unsigned num_components = nir_tex_instr_dest_size(instr);
2527    emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, bld.dispatch_width(),
2528                              dest, this->result),
2529                 (1 << num_components) - 1);
2530 }
2531
2532 void
2533 fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
2534 {
2535    switch (instr->type) {
2536    case nir_jump_break:
2537       bld.emit(BRW_OPCODE_BREAK);
2538       break;
2539    case nir_jump_continue:
2540       bld.emit(BRW_OPCODE_CONTINUE);
2541       break;
2542    case nir_jump_return:
2543    default:
2544       unreachable("unknown jump");
2545    }
2546 }