src/amd/compiler/aco_instruction_selection.cpp

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  * Copyright © 2018 Google
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  *
  24  */
  25
  26 #include <algorithm>
  27 #include <array>
  28 #include <map>
  29
  30 #include "ac_shader_util.h"
  31 #include "aco_ir.h"
  32 #include "aco_builder.h"
  33 #include "aco_interface.h"
  34 #include "aco_instruction_selection_setup.cpp"
  35 #include "util/fast_idiv_by_const.h"
  36
  37 namespace aco {
  38 namespace {
  39
  40 class loop_info_RAII {
  41    isel_context* ctx;
  42    unsigned header_idx_old;
  43    Block* exit_old;
  44    bool divergent_cont_old;
  45    bool divergent_branch_old;
  46    bool divergent_if_old;
  47
  48 public:
  49    loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
  50       : ctx(ctx),
  51         header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
  52         divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
  53         divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
  54         divergent_if_old(ctx->cf_info.parent_if.is_divergent)
  55    {
  56       ctx->cf_info.parent_loop.header_idx = loop_header_idx;
  57       ctx->cf_info.parent_loop.exit = loop_exit;
  58       ctx->cf_info.parent_loop.has_divergent_continue = false;
  59       ctx->cf_info.parent_loop.has_divergent_branch = false;
  60       ctx->cf_info.parent_if.is_divergent = false;
  61       ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
  62    }
  63
  64    ~loop_info_RAII()
  65    {
  66       ctx->cf_info.parent_loop.header_idx = header_idx_old;
  67       ctx->cf_info.parent_loop.exit = exit_old;
  68       ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
  69       ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
  70       ctx->cf_info.parent_if.is_divergent = divergent_if_old;
  71       ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
  72       if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
  73          ctx->cf_info.exec_potentially_empty = false;
  74    }
  75 };
  76
/* State carried along while an if construct is emitted.
 * NOTE(review): the users of this struct are not visible here; the
 * field notes below are inferred from the names only -- confirm. */
struct if_context {
   Temp cond; /* presumably the branch condition */

   /* presumably control-flow state saved on entry, to be restored later */
   bool divergent_old;
   bool exec_potentially_empty_old;

   unsigned BB_if_idx;
   unsigned invert_idx;
   bool then_branch_divergent;
   /* blocks stored by value inside the context */
   Block BB_invert;
   Block BB_endif;
};
  89
/* Forward declaration, so that code above the definition can call it. */
static void visit_cf_list(struct isel_context *ctx,
                          struct exec_list *list);
  92
  93 static void add_logical_edge(unsigned pred_idx, Block *succ)
  94 {
  95    succ->logical_preds.emplace_back(pred_idx);
  96 }
  97
  98
  99 static void add_linear_edge(unsigned pred_idx, Block *succ)
 100 {
 101    succ->linear_preds.emplace_back(pred_idx);
 102 }
 103
 104 static void add_edge(unsigned pred_idx, Block *succ)
 105 {
 106    add_logical_edge(pred_idx, succ);
 107    add_linear_edge(pred_idx, succ);
 108 }
 109
 110 static void append_logical_start(Block *b)
 111 {
 112    Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
 113 }
 114
 115 static void append_logical_end(Block *b)
 116 {
 117    Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
 118 }
 119
 120 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
 121 {
 122    assert(ctx->allocated[def->index].id());
 123    return ctx->allocated[def->index];
 124 }
 125
 126 Temp emit_mbcnt(isel_context *ctx, Definition dst,
 127                 Operand mask_lo = Operand((uint32_t) -1), Operand mask_hi = Operand((uint32_t) -1))
 128 {
 129    Builder bld(ctx->program, ctx->block);
 130    Definition lo_def = ctx->program->wave_size == 32 ? dst : bld.def(v1);
 131    Temp thread_id_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, lo_def, mask_lo, Operand(0u));
 132
 133    if (ctx->program->wave_size == 32) {
 134       return thread_id_lo;
 135    } else {
 136       Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
 137       return thread_id_hi;
 138    }
 139 }
 140
 141 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
 142 {
 143    Builder bld(ctx->program, ctx->block);
 144
 145    if (!dst.id())
 146       dst = bld.tmp(src.regClass());
 147
 148    assert(src.size() == dst.size());
 149
 150    if (ctx->stage != fragment_fs) {
 151       if (!dst.id())
 152          return src;
 153
 154       bld.copy(Definition(dst), src);
 155       return dst;
 156    }
 157
 158    bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
 159    ctx->program->needs_wqm |= program_needs_wqm;
 160    return dst;
 161 }
 162
 163 static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
 164 {
 165    if (index.regClass() == s1)
 166       return bld.readlane(bld.def(s1), data, index);
 167
 168    Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
 169
 170    /* Currently not implemented on GFX6-7 */
 171    assert(ctx->options->chip_class >= GFX8);
 172
 173    if (ctx->options->chip_class <= GFX9 || ctx->program->wave_size == 32) {
 174       return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
 175    }
 176
 177    /* GFX10, wave64 mode:
 178     * The bpermute instruction is limited to half-wave operation, which means that it can't
 179     * properly support subgroup shuffle like older generations (or wave32 mode), so we
 180     * emulate it here.
 181     */
 182    if (!ctx->has_gfx10_wave64_bpermute) {
 183       ctx->has_gfx10_wave64_bpermute = true;
 184       ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
 185       ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
 186    }
 187
 188    Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
 189    Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id);
 190    Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index);
 191    Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2, vcc), lane_is_hi, index_is_hi);
 192
 193    return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
 194                         bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute);
 195 }
 196
 197 Temp as_vgpr(isel_context *ctx, Temp val)
 198 {
 199    if (val.type() == RegType::sgpr) {
 200       Builder bld(ctx->program, ctx->block);
 201       return bld.copy(bld.def(RegType::vgpr, val.size()), val);
 202    }
 203    assert(val.type() == RegType::vgpr);
 204    return val;
 205 }
 206
 207 //assumes a != 0xffffffff
 208 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
 209 {
 210    assert(b != 0);
 211    Builder bld(ctx->program, ctx->block);
 212
 213    if (util_is_power_of_two_or_zero(b)) {
 214       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
 215       return;
 216    }
 217
 218    util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
 219
 220    assert(info.multiplier <= 0xffffffff);
 221
 222    bool pre_shift = info.pre_shift != 0;
 223    bool increment = info.increment != 0;
 224    bool multiply = true;
 225    bool post_shift = info.post_shift != 0;
 226
 227    if (!pre_shift && !increment && !multiply && !post_shift) {
 228       bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
 229       return;
 230    }
 231
 232    Temp pre_shift_dst = a;
 233    if (pre_shift) {
 234       pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
 235       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
 236    }
 237
 238    Temp increment_dst = pre_shift_dst;
 239    if (increment) {
 240       increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
 241       bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
 242    }
 243
 244    Temp multiply_dst = increment_dst;
 245    if (multiply) {
 246       multiply_dst = post_shift ? bld.tmp(v1) : dst;
 247       bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
 248                bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
 249    }
 250
 251    if (post_shift) {
 252       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
 253    }
 254 }
 255
 256 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
 257 {
 258    Builder bld(ctx->program, ctx->block);
 259    bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
 260 }
 261
 262
 263 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
 264 {
 265    /* no need to extract the whole vector */
 266    if (src.regClass() == dst_rc) {
 267       assert(idx == 0);
 268       return src;
 269    }
 270    assert(src.size() > idx);
 271    Builder bld(ctx->program, ctx->block);
 272    auto it = ctx->allocated_vec.find(src.id());
 273    /* the size check needs to be early because elements other than 0 may be garbage */
 274    if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) {
 275       if (it->second[idx].regClass() == dst_rc) {
 276          return it->second[idx];
 277       } else {
 278          assert(dst_rc.size() == it->second[idx].regClass().size());
 279          assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
 280          return bld.copy(bld.def(dst_rc), it->second[idx]);
 281       }
 282    }
 283
 284    if (src.size() == dst_rc.size()) {
 285       assert(idx == 0);
 286       return bld.copy(bld.def(dst_rc), src);
 287    } else {
 288       Temp dst = bld.tmp(dst_rc);
 289       emit_extract_vector(ctx, src, idx, dst);
 290       return dst;
 291    }
 292 }
 293
 294 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
 295 {
 296    if (num_components == 1)
 297       return;
 298    if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
 299       return;
 300    aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
 301    split->operands[0] = Operand(vec_src);
 302    std::array<Temp,4> elems;
 303    for (unsigned i = 0; i < num_components; i++) {
 304       elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
 305       split->definitions[i] = Definition(elems[i]);
 306    }
 307    ctx->block->instructions.emplace_back(std::move(split));
 308    ctx->allocated_vec.emplace(vec_src.id(), elems);
 309 }
 310
 311 /* This vector expansion uses a mask to determine which elements in the new vector
 312  * come from the original vector. The other elements are undefined. */
 313 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
 314 {
 315    emit_split_vector(ctx, vec_src, util_bitcount(mask));
 316
 317    if (vec_src == dst)
 318       return;
 319
 320    Builder bld(ctx->program, ctx->block);
 321    if (num_components == 1) {
 322       if (dst.type() == RegType::sgpr)
 323          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
 324       else
 325          bld.copy(Definition(dst), vec_src);
 326       return;
 327    }
 328
 329    unsigned component_size = dst.size() / num_components;
 330    std::array<Temp,4> elems;
 331
 332    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
 333    vec->definitions[0] = Definition(dst);
 334    unsigned k = 0;
 335    for (unsigned i = 0; i < num_components; i++) {
 336       if (mask & (1 << i)) {
 337          Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
 338          if (dst.type() == RegType::sgpr)
 339             src = bld.as_uniform(src);
 340          vec->operands[i] = Operand(src);
 341       } else {
 342          vec->operands[i] = Operand(0u);
 343       }
 344       elems[i] = vec->operands[i].getTemp();
 345    }
 346    ctx->block->instructions.emplace_back(std::move(vec));
 347    ctx->allocated_vec.emplace(dst.id(), elems);
 348 }
 349
 350 Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
 351 {
 352    Builder bld(ctx->program, ctx->block);
 353    if (!dst.id())
 354       dst = bld.tmp(bld.lm);
 355
 356    assert(val.regClass() == s1);
 357    assert(dst.regClass() == bld.lm);
 358
 359    return bld.sop2(Builder::s_cselect, bld.hint_vcc(Definition(dst)), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
 360 }
 361
 362 Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
 363 {
 364    Builder bld(ctx->program, ctx->block);
 365    if (!dst.id())
 366       dst = bld.tmp(s1);
 367
 368    assert(val.regClass() == bld.lm);
 369    assert(dst.regClass() == s1);
 370
 371    /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
 372    Temp tmp = bld.tmp(s1);
 373    bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
 374    return emit_wqm(ctx, tmp, dst);
 375 }
 376
 377 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
 378 {
 379    if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
 380       return get_ssa_temp(ctx, src.src.ssa);
 381
 382    if (src.src.ssa->num_components == size) {
 383       bool identity_swizzle = true;
 384       for (unsigned i = 0; identity_swizzle && i < size; i++) {
 385          if (src.swizzle[i] != i)
 386             identity_swizzle = false;
 387       }
 388       if (identity_swizzle)
 389          return get_ssa_temp(ctx, src.src.ssa);
 390    }
 391
 392    Temp vec = get_ssa_temp(ctx, src.src.ssa);
 393    unsigned elem_size = vec.size() / src.src.ssa->num_components;
 394    assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */
 395    assert(vec.size() % elem_size == 0);
 396
 397    RegClass elem_rc = RegClass(vec.type(), elem_size);
 398    if (size == 1) {
 399       return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
 400    } else {
 401       assert(size <= 4);
 402       std::array<Temp,4> elems;
 403       aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
 404       for (unsigned i = 0; i < size; ++i) {
 405          elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
 406          vec_instr->operands[i] = Operand{elems[i]};
 407       }
 408       Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)};
 409       vec_instr->definitions[0] = Definition(dst);
 410       ctx->block->instructions.emplace_back(std::move(vec_instr));
 411       ctx->allocated_vec.emplace(dst.id(), elems);
 412       return dst;
 413    }
 414 }
 415
 416 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
 417 {
 418    if (ptr.size() == 2)
 419       return ptr;
 420    Builder bld(ctx->program, ctx->block);
 421    if (ptr.type() == RegType::vgpr)
 422       ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
 423    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
 424                      ptr, Operand((unsigned)ctx->options->address32_hi));
 425 }
 426
 427 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
 428 {
 429    aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
 430    sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
 431    sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
 432    sop2->definitions[0] = Definition(dst);
 433    if (writes_scc)
 434       sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
 435    ctx->block->instructions.emplace_back(std::move(sop2));
 436 }
 437
 438 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool commutative, bool swap_srcs=false)
 439 {
 440    Builder bld(ctx->program, ctx->block);
 441    Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
 442    Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
 443    if (src1.type() == RegType::sgpr) {
 444       if (commutative && src0.type() == RegType::vgpr) {
 445          Temp t = src0;
 446          src0 = src1;
 447          src1 = t;
 448       } else if (src0.type() == RegType::vgpr &&
 449                  op != aco_opcode::v_madmk_f32 &&
 450                  op != aco_opcode::v_madak_f32 &&
 451                  op != aco_opcode::v_madmk_f16 &&
 452                  op != aco_opcode::v_madak_f16) {
 453          /* If the instruction is not commutative, we emit a VOP3A instruction */
 454          bld.vop2_e64(op, Definition(dst), src0, src1);
 455          return;
 456       } else {
 457          src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
 458       }
 459    }
 460    bld.vop2(op, Definition(dst), src0, src1);
 461 }
 462
 463 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 464 {
 465    Temp src0 = get_alu_src(ctx, instr->src[0]);
 466    Temp src1 = get_alu_src(ctx, instr->src[1]);
 467    Temp src2 = get_alu_src(ctx, instr->src[2]);
 468
 469    /* ensure that the instruction has at most 1 sgpr operand
 470     * The optimizer will inline constants for us */
 471    if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
 472       src0 = as_vgpr(ctx, src0);
 473    if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
 474       src1 = as_vgpr(ctx, src1);
 475    if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
 476       src2 = as_vgpr(ctx, src2);
 477
 478    Builder bld(ctx->program, ctx->block);
 479    bld.vop3(op, Definition(dst), src0, src1, src2);
 480 }
 481
 482 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 483 {
 484    Builder bld(ctx->program, ctx->block);
 485    bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
 486 }
 487
 488 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 489 {
 490    Temp src0 = get_alu_src(ctx, instr->src[0]);
 491    Temp src1 = get_alu_src(ctx, instr->src[1]);
 492    assert(src0.size() == src1.size());
 493
 494    aco_ptr<Instruction> vopc;
 495    if (src1.type() == RegType::sgpr) {
 496       if (src0.type() == RegType::vgpr) {
 497          /* to swap the operands, we might also have to change the opcode */
 498          switch (op) {
 499             case aco_opcode::v_cmp_lt_f32:
 500                op = aco_opcode::v_cmp_gt_f32;
 501                break;
 502             case aco_opcode::v_cmp_ge_f32:
 503                op = aco_opcode::v_cmp_le_f32;
 504                break;
 505             case aco_opcode::v_cmp_lt_i32:
 506                op = aco_opcode::v_cmp_gt_i32;
 507                break;
 508             case aco_opcode::v_cmp_ge_i32:
 509                op = aco_opcode::v_cmp_le_i32;
 510                break;
 511             case aco_opcode::v_cmp_lt_u32:
 512                op = aco_opcode::v_cmp_gt_u32;
 513                break;
 514             case aco_opcode::v_cmp_ge_u32:
 515                op = aco_opcode::v_cmp_le_u32;
 516                break;
 517             case aco_opcode::v_cmp_lt_f64:
 518                op = aco_opcode::v_cmp_gt_f64;
 519                break;
 520             case aco_opcode::v_cmp_ge_f64:
 521                op = aco_opcode::v_cmp_le_f64;
 522                break;
 523             case aco_opcode::v_cmp_lt_i64:
 524                op = aco_opcode::v_cmp_gt_i64;
 525                break;
 526             case aco_opcode::v_cmp_ge_i64:
 527                op = aco_opcode::v_cmp_le_i64;
 528                break;
 529             case aco_opcode::v_cmp_lt_u64:
 530                op = aco_opcode::v_cmp_gt_u64;
 531                break;
 532             case aco_opcode::v_cmp_ge_u64:
 533                op = aco_opcode::v_cmp_le_u64;
 534                break;
 535             default: /* eq and ne are commutative */
 536                break;
 537          }
 538          Temp t = src0;
 539          src0 = src1;
 540          src1 = t;
 541       } else {
 542          src1 = as_vgpr(ctx, src1);
 543       }
 544    }
 545
 546    Builder bld(ctx->program, ctx->block);
 547    bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
 548 }
 549
 550 void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 551 {
 552    Temp src0 = get_alu_src(ctx, instr->src[0]);
 553    Temp src1 = get_alu_src(ctx, instr->src[1]);
 554    Builder bld(ctx->program, ctx->block);
 555
 556    assert(dst.regClass() == bld.lm);
 557    assert(src0.type() == RegType::sgpr);
 558    assert(src1.type() == RegType::sgpr);
 559    assert(src0.regClass() == src1.regClass());
 560
 561    /* Emit the SALU comparison instruction */
 562    Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
 563    /* Turn the result into a per-lane bool */
 564    bool_to_vector_condition(ctx, cmp, dst);
 565 }
 566
 567 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
 568                      aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::last_opcode, aco_opcode s64_op = aco_opcode::last_opcode)
 569 {
 570    aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : s32_op;
 571    aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : v32_op;
 572    bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
 573    bool use_valu = s_op == aco_opcode::last_opcode ||
 574                    divergent_vals ||
 575                    ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
 576                    ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
 577    aco_opcode op = use_valu ? v_op : s_op;
 578    assert(op != aco_opcode::last_opcode);
 579
 580    if (use_valu)
 581       emit_vopc_instruction(ctx, instr, op, dst);
 582    else
 583       emit_sopc_instruction(ctx, instr, op, dst);
 584 }
 585
 586 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
 587 {
 588    Builder bld(ctx->program, ctx->block);
 589    Temp src0 = get_alu_src(ctx, instr->src[0]);
 590    Temp src1 = get_alu_src(ctx, instr->src[1]);
 591
 592    assert(dst.regClass() == bld.lm);
 593    assert(src0.regClass() == bld.lm);
 594    assert(src1.regClass() == bld.lm);
 595
 596    bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
 597 }
 598
 599 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
 600 {
 601    Builder bld(ctx->program, ctx->block);
 602    Temp cond = get_alu_src(ctx, instr->src[0]);
 603    Temp then = get_alu_src(ctx, instr->src[1]);
 604    Temp els = get_alu_src(ctx, instr->src[2]);
 605
 606    assert(cond.regClass() == bld.lm);
 607
 608    if (dst.type() == RegType::vgpr) {
 609       aco_ptr<Instruction> bcsel;
 610       if (dst.size() == 1) {
 611          then = as_vgpr(ctx, then);
 612          els = as_vgpr(ctx, els);
 613
 614          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
 615       } else if (dst.size() == 2) {
 616          Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
 617          bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
 618          Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
 619          bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
 620
 621          Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
 622          Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
 623
 624          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
 625       } else {
 626          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 627          nir_print_instr(&instr->instr, stderr);
 628          fprintf(stderr, "\n");
 629       }
 630       return;
 631    }
 632
 633    if (instr->dest.dest.ssa.bit_size == 1) {
 634       assert(dst.regClass() == bld.lm);
 635       assert(then.regClass() == bld.lm);
 636       assert(els.regClass() == bld.lm);
 637    }
 638
 639    if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */
 640       if (dst.regClass() == s1 || dst.regClass() == s2) {
 641          assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
 642          assert(dst.size() == then.size());
 643          aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
 644          bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
 645       } else {
 646          fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
 647          nir_print_instr(&instr->instr, stderr);
 648          fprintf(stderr, "\n");
 649       }
 650       return;
 651    }
 652
 653    /* divergent boolean bcsel
 654     * this implements bcsel on bools: dst = s0 ? s1 : s2
 655     * are going to be: dst = (s0 & s1) | (~s0 & s2) */
 656    assert(instr->dest.dest.ssa.bit_size == 1);
 657
 658    if (cond.id() != then.id())
 659       then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
 660
 661    if (cond.id() == els.id())
 662       bld.sop1(Builder::s_mov, Definition(dst), then);
 663    else
 664       bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
 665                bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
 666 }
 667
 668 void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
 669                     aco_opcode op, uint32_t undo)
 670 {
 671    /* multiply by 16777216 to handle denormals */
 672    Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
 673                                as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
 674    Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
 675    scaled = bld.vop1(op, bld.def(v1), scaled);
 676    scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);
 677
 678    Temp not_scaled = bld.vop1(op, bld.def(v1), val);
 679
 680    bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
 681 }
 682
 683 void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
 684 {
 685    if (ctx->block->fp_mode.denorm32 == 0) {
 686       bld.vop1(aco_opcode::v_rcp_f32, dst, val);
 687       return;
 688    }
 689
 690    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
 691 }
 692
 693 void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
 694 {
 695    if (ctx->block->fp_mode.denorm32 == 0) {
 696       bld.vop1(aco_opcode::v_rsq_f32, dst, val);
 697       return;
 698    }
 699
 700    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
 701 }
 702
 703 void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
 704 {
 705    if (ctx->block->fp_mode.denorm32 == 0) {
 706       bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
 707       return;
 708    }
 709
 710    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
 711 }
 712
 713 void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
 714 {
 715    if (ctx->block->fp_mode.denorm32 == 0) {
 716       bld.vop1(aco_opcode::v_log_f32, dst, val);
 717       return;
 718    }
 719
 720    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
 721 }
 722
 723 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
 724 {
 725    if (!instr->dest.dest.is_ssa) {
 726       fprintf(stderr, "nir alu dst not in ssa: ");
 727       nir_print_instr(&instr->instr, stderr);
 728       fprintf(stderr, "\n");
 729       abort();
 730    }
 731    Builder bld(ctx->program, ctx->block);
 732    Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
 733    switch(instr->op) {
 734    case nir_op_vec2:
 735    case nir_op_vec3:
 736    case nir_op_vec4: {
 737       std::array<Temp,4> elems;
 738       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
 739       for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
 740          elems[i] = get_alu_src(ctx, instr->src[i]);
 741          vec->operands[i] = Operand{elems[i]};
 742       }
 743       vec->definitions[0] = Definition(dst);
 744       ctx->block->instructions.emplace_back(std::move(vec));
 745       ctx->allocated_vec.emplace(dst.id(), elems);
 746       break;
 747    }
 748    case nir_op_mov: {
 749       Temp src = get_alu_src(ctx, instr->src[0]);
 750       aco_ptr<Instruction> mov;
 751       if (dst.type() == RegType::sgpr) {
 752          if (src.type() == RegType::vgpr)
 753             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
 754          else if (src.regClass() == s1)
 755             bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
 756          else if (src.regClass() == s2)
 757             bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
 758          else
 759             unreachable("wrong src register class for nir_op_imov");
 760       } else if (dst.regClass() == v1) {
 761          bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
 762       } else if (dst.regClass() == v2) {
 763          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
 764       } else {
 765          nir_print_instr(&instr->instr, stderr);
 766          unreachable("Should have been lowered to scalar.");
 767       }
 768       break;
 769    }
 770    case nir_op_inot: {
 771       Temp src = get_alu_src(ctx, instr->src[0]);
 772       if (instr->dest.dest.ssa.bit_size == 1) {
 773          assert(src.regClass() == bld.lm);
 774          assert(dst.regClass() == bld.lm);
 775          bld.sop2(Builder::s_andn2, Definition(dst), bld.def(s1, scc), Operand(exec, bld.lm), src);
 776       } else if (dst.regClass() == v1) {
 777          emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
 778       } else if (dst.type() == RegType::sgpr) {
 779          aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
 780          bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
 781       } else {
 782          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 783          nir_print_instr(&instr->instr, stderr);
 784          fprintf(stderr, "\n");
 785       }
 786       break;
 787    }
 788    case nir_op_ineg: {
 789       Temp src = get_alu_src(ctx, instr->src[0]);
 790       if (dst.regClass() == v1) {
 791          bld.vsub32(Definition(dst), Operand(0u), Operand(src));
 792       } else if (dst.regClass() == s1) {
 793          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
 794       } else if (dst.size() == 2) {
 795          Temp src0 = bld.tmp(dst.type(), 1);
 796          Temp src1 = bld.tmp(dst.type(), 1);
 797          bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
 798
 799          if (dst.regClass() == s2) {
 800             Temp carry = bld.tmp(s1);
 801             Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
 802             Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
 803             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
 804          } else {
 805             Temp lower = bld.tmp(v1);
 806             Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
 807             Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
 808             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
 809          }
 810       } else {
 811          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 812          nir_print_instr(&instr->instr, stderr);
 813          fprintf(stderr, "\n");
 814       }
 815       break;
 816    }
 817    case nir_op_iabs: {
 818       if (dst.regClass() == s1) {
 819          bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
 820       } else if (dst.regClass() == v1) {
 821          Temp src = get_alu_src(ctx, instr->src[0]);
 822          bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
 823       } else {
 824          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 825          nir_print_instr(&instr->instr, stderr);
 826          fprintf(stderr, "\n");
 827       }
 828       break;
 829    }
 830    case nir_op_isign: {
 831       Temp src = get_alu_src(ctx, instr->src[0]);
 832       if (dst.regClass() == s1) {
 833          Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
 834          Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u));
 835          bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp);
 836       } else if (dst.regClass() == s2) {
 837          Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
 838          Temp neqz;
 839          if (ctx->program->chip_class >= GFX8)
 840             neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
 841          else
 842             neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp();
 843          /* SCC gets zero-extended to 64 bit */
 844          bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
 845       } else if (dst.regClass() == v1) {
 846          Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
 847          Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
 848          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
 849       } else if (dst.regClass() == v2) {
 850          Temp upper = emit_extract_vector(ctx, src, 1, v1);
 851          Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
 852          Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
 853          Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
 854          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
 855          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
 856       } else {
 857          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 858          nir_print_instr(&instr->instr, stderr);
 859          fprintf(stderr, "\n");
 860       }
 861       break;
 862    }
 863    case nir_op_imax: {
 864       if (dst.regClass() == v1) {
 865          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
 866       } else if (dst.regClass() == s1) {
 867          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
 868       } else {
 869          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 870          nir_print_instr(&instr->instr, stderr);
 871          fprintf(stderr, "\n");
 872       }
 873       break;
 874    }
 875    case nir_op_umax: {
 876       if (dst.regClass() == v1) {
 877          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
 878       } else if (dst.regClass() == s1) {
 879          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
 880       } else {
 881          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 882          nir_print_instr(&instr->instr, stderr);
 883          fprintf(stderr, "\n");
 884       }
 885       break;
 886    }
 887    case nir_op_imin: {
 888       if (dst.regClass() == v1) {
 889          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
 890       } else if (dst.regClass() == s1) {
 891          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
 892       } else {
 893          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 894          nir_print_instr(&instr->instr, stderr);
 895          fprintf(stderr, "\n");
 896       }
 897       break;
 898    }
 899    case nir_op_umin: {
 900       if (dst.regClass() == v1) {
 901          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
 902       } else if (dst.regClass() == s1) {
 903          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
 904       } else {
 905          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 906          nir_print_instr(&instr->instr, stderr);
 907          fprintf(stderr, "\n");
 908       }
 909       break;
 910    }
 911    case nir_op_ior: {
 912       if (instr->dest.dest.ssa.bit_size == 1) {
 913          emit_boolean_logic(ctx, instr, Builder::s_or, dst);
 914       } else if (dst.regClass() == v1) {
 915          emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
 916       } else if (dst.regClass() == s1) {
 917          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
 918       } else if (dst.regClass() == s2) {
 919          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
 920       } else {
 921          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 922          nir_print_instr(&instr->instr, stderr);
 923          fprintf(stderr, "\n");
 924       }
 925       break;
 926    }
 927    case nir_op_iand: {
 928       if (instr->dest.dest.ssa.bit_size == 1) {
 929          emit_boolean_logic(ctx, instr, Builder::s_and, dst);
 930       } else if (dst.regClass() == v1) {
 931          emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
 932       } else if (dst.regClass() == s1) {
 933          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
 934       } else if (dst.regClass() == s2) {
 935          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
 936       } else {
 937          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 938          nir_print_instr(&instr->instr, stderr);
 939          fprintf(stderr, "\n");
 940       }
 941       break;
 942    }
 943    case nir_op_ixor: {
 944       if (instr->dest.dest.ssa.bit_size == 1) {
 945          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
 946       } else if (dst.regClass() == v1) {
 947          emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
 948       } else if (dst.regClass() == s1) {
 949          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
 950       } else if (dst.regClass() == s2) {
 951          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
 952       } else {
 953          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 954          nir_print_instr(&instr->instr, stderr);
 955          fprintf(stderr, "\n");
 956       }
 957       break;
 958    }
 959    case nir_op_ushr: {
 960       if (dst.regClass() == v1) {
 961          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
 962       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
 963          bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
 964                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
 965       } else if (dst.regClass() == v2) {
 966          bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
 967                   get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
 968       } else if (dst.regClass() == s2) {
 969          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
 970       } else if (dst.regClass() == s1) {
 971          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
 972       } else {
 973          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 974          nir_print_instr(&instr->instr, stderr);
 975          fprintf(stderr, "\n");
 976       }
 977       break;
 978    }
 979    case nir_op_ishl: {
 980       if (dst.regClass() == v1) {
 981          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
 982       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
 983          bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
 984                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
 985       } else if (dst.regClass() == v2) {
 986          bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
 987                   get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
 988       } else if (dst.regClass() == s1) {
 989          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
 990       } else if (dst.regClass() == s2) {
 991          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
 992       } else {
 993          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 994          nir_print_instr(&instr->instr, stderr);
 995          fprintf(stderr, "\n");
 996       }
 997       break;
 998    }
 999    case nir_op_ishr: {
1000       if (dst.regClass() == v1) {
1001          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1002       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1003          bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
1004                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1005       } else if (dst.regClass() == v2) {
1006          bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
1007                   get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1008       } else if (dst.regClass() == s1) {
1009          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1010       } else if (dst.regClass() == s2) {
1011          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1012       } else {
1013          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1014          nir_print_instr(&instr->instr, stderr);
1015          fprintf(stderr, "\n");
1016       }
1017       break;
1018    }
1019    case nir_op_find_lsb: {
1020       Temp src = get_alu_src(ctx, instr->src[0]);
1021       if (src.regClass() == s1) {
1022          bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1023       } else if (src.regClass() == v1) {
1024          emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1025       } else if (src.regClass() == s2) {
1026          bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1027       } else {
1028          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1029          nir_print_instr(&instr->instr, stderr);
1030          fprintf(stderr, "\n");
1031       }
1032       break;
1033    }
1034    case nir_op_ufind_msb:
1035    case nir_op_ifind_msb: {
1036       Temp src = get_alu_src(ctx, instr->src[0]);
1037       if (src.regClass() == s1 || src.regClass() == s2) {
1038          aco_opcode op = src.regClass() == s2 ?
1039                          (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
1040                          (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
1041          Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1042
1043          Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1044                                         Operand(src.size() * 32u - 1u), msb_rev);
1045          Temp msb = sub.def(0).getTemp();
1046          Temp carry = sub.def(1).getTemp();
1047
1048          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, carry);
1049       } else if (src.regClass() == v1) {
1050          aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1051          Temp msb_rev = bld.tmp(v1);
1052          emit_vop1_instruction(ctx, instr, op, msb_rev);
1053          Temp msb = bld.tmp(v1);
1054          Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
1055          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
1056       } else {
1057          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1058          nir_print_instr(&instr->instr, stderr);
1059          fprintf(stderr, "\n");
1060       }
1061       break;
1062    }
1063    case nir_op_bitfield_reverse: {
1064       if (dst.regClass() == s1) {
1065          bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1066       } else if (dst.regClass() == v1) {
1067          bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1068       } else {
1069          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1070          nir_print_instr(&instr->instr, stderr);
1071          fprintf(stderr, "\n");
1072       }
1073       break;
1074    }
1075    case nir_op_iadd: {
1076       if (dst.regClass() == s1) {
1077          emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1078          break;
1079       }
1080
1081       Temp src0 = get_alu_src(ctx, instr->src[0]);
1082       Temp src1 = get_alu_src(ctx, instr->src[1]);
1083       if (dst.regClass() == v1) {
1084          bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1085          break;
1086       }
1087
1088       assert(src0.size() == 2 && src1.size() == 2);
1089       Temp src00 = bld.tmp(src0.type(), 1);
1090       Temp src01 = bld.tmp(dst.type(), 1);
1091       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1092       Temp src10 = bld.tmp(src1.type(), 1);
1093       Temp src11 = bld.tmp(dst.type(), 1);
1094       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1095
1096       if (dst.regClass() == s2) {
1097          Temp carry = bld.tmp(s1);
1098          Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1099          Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
1100          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1101       } else if (dst.regClass() == v2) {
1102          Temp dst0 = bld.tmp(v1);
1103          Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1104          Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1105          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1106       } else {
1107          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1108          nir_print_instr(&instr->instr, stderr);
1109          fprintf(stderr, "\n");
1110       }
1111       break;
1112    }
1113    case nir_op_uadd_sat: {
1114       Temp src0 = get_alu_src(ctx, instr->src[0]);
1115       Temp src1 = get_alu_src(ctx, instr->src[1]);
1116       if (dst.regClass() == s1) {
1117          Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1118          bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1119                   src0, src1);
1120          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1121       } else if (dst.regClass() == v1) {
1122          if (ctx->options->chip_class >= GFX9) {
1123             aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1124             add->operands[0] = Operand(src0);
1125             add->operands[1] = Operand(src1);
1126             add->definitions[0] = Definition(dst);
1127             add->clamp = 1;
1128             ctx->block->instructions.emplace_back(std::move(add));
1129          } else {
1130             if (src1.regClass() != v1)
1131                std::swap(src0, src1);
1132             assert(src1.regClass() == v1);
1133             Temp tmp = bld.tmp(v1);
1134             Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1135             bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1136          }
1137       } else {
1138          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1139          nir_print_instr(&instr->instr, stderr);
1140          fprintf(stderr, "\n");
1141       }
1142       break;
1143    }
1144    case nir_op_uadd_carry: {
1145       Temp src0 = get_alu_src(ctx, instr->src[0]);
1146       Temp src1 = get_alu_src(ctx, instr->src[1]);
1147       if (dst.regClass() == s1) {
1148          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1149          break;
1150       }
1151       if (dst.regClass() == v1) {
1152          Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1153          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1154          break;
1155       }
1156
1157       Temp src00 = bld.tmp(src0.type(), 1);
1158       Temp src01 = bld.tmp(dst.type(), 1);
1159       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1160       Temp src10 = bld.tmp(src1.type(), 1);
1161       Temp src11 = bld.tmp(dst.type(), 1);
1162       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1163       if (dst.regClass() == s2) {
1164          Temp carry = bld.tmp(s1);
1165          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1166          carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1167          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1168       } else if (dst.regClass() == v2) {
1169          Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1170          carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1171          carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1172          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1173       } else {
1174          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1175          nir_print_instr(&instr->instr, stderr);
1176          fprintf(stderr, "\n");
1177       }
1178       break;
1179    }
1180    case nir_op_isub: {
1181       if (dst.regClass() == s1) {
1182          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1183          break;
1184       }
1185
1186       Temp src0 = get_alu_src(ctx, instr->src[0]);
1187       Temp src1 = get_alu_src(ctx, instr->src[1]);
1188       if (dst.regClass() == v1) {
1189          bld.vsub32(Definition(dst), src0, src1);
1190          break;
1191       }
1192
1193       Temp src00 = bld.tmp(src0.type(), 1);
1194       Temp src01 = bld.tmp(dst.type(), 1);
1195       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1196       Temp src10 = bld.tmp(src1.type(), 1);
1197       Temp src11 = bld.tmp(dst.type(), 1);
1198       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1199       if (dst.regClass() == s2) {
1200          Temp carry = bld.tmp(s1);
1201          Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1202          Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1203          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1204       } else if (dst.regClass() == v2) {
1205          Temp lower = bld.tmp(v1);
1206          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1207          Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1208          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1209       } else {
1210          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1211          nir_print_instr(&instr->instr, stderr);
1212          fprintf(stderr, "\n");
1213       }
1214       break;
1215    }
1216    case nir_op_usub_borrow: {
1217       Temp src0 = get_alu_src(ctx, instr->src[0]);
1218       Temp src1 = get_alu_src(ctx, instr->src[1]);
1219       if (dst.regClass() == s1) {
1220          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1221          break;
1222       } else if (dst.regClass() == v1) {
1223          Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1224          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1225          break;
1226       }
1227
1228       Temp src00 = bld.tmp(src0.type(), 1);
1229       Temp src01 = bld.tmp(dst.type(), 1);
1230       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1231       Temp src10 = bld.tmp(src1.type(), 1);
1232       Temp src11 = bld.tmp(dst.type(), 1);
1233       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1234       if (dst.regClass() == s2) {
1235          Temp borrow = bld.tmp(s1);
1236          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1237          borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1238          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1239       } else if (dst.regClass() == v2) {
1240          Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1241          borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1242          borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1243          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1244       } else {
1245          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1246          nir_print_instr(&instr->instr, stderr);
1247          fprintf(stderr, "\n");
1248       }
1249       break;
1250    }
1251    case nir_op_imul: {
1252       if (dst.regClass() == v1) {
1253          bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1254                   get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1255       } else if (dst.regClass() == s1) {
1256          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1257       } else {
1258          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1259          nir_print_instr(&instr->instr, stderr);
1260          fprintf(stderr, "\n");
1261       }
1262       break;
1263    }
1264    case nir_op_umul_high: {
1265       if (dst.regClass() == v1) {
1266          bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1267       } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1268          bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1269       } else if (dst.regClass() == s1) {
1270          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1271                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1272          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1273       } else {
1274          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1275          nir_print_instr(&instr->instr, stderr);
1276          fprintf(stderr, "\n");
1277       }
1278       break;
1279    }
1280    case nir_op_imul_high: {
1281       if (dst.regClass() == v1) {
1282          bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1283       } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1284          bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1285       } else if (dst.regClass() == s1) {
1286          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1287                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1288          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1289       } else {
1290          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1291          nir_print_instr(&instr->instr, stderr);
1292          fprintf(stderr, "\n");
1293       }
1294       break;
1295    }
1296    case nir_op_fmul: {
1297       if (dst.size() == 1) {
1298          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1299       } else if (dst.size() == 2) {
1300          bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1301                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1302       } else {
1303          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1304          nir_print_instr(&instr->instr, stderr);
1305          fprintf(stderr, "\n");
1306       }
1307       break;
1308    }
1309    case nir_op_fadd: {
1310       if (dst.size() == 1) {
1311          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1312       } else if (dst.size() == 2) {
1313          bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1314                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1315       } else {
1316          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1317          nir_print_instr(&instr->instr, stderr);
1318          fprintf(stderr, "\n");
1319       }
1320       break;
1321    }
1322    case nir_op_fsub: {
1323       Temp src0 = get_alu_src(ctx, instr->src[0]);
1324       Temp src1 = get_alu_src(ctx, instr->src[1]);
1325       if (dst.size() == 1) {
1326          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1327             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1328          else
1329             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1330       } else if (dst.size() == 2) {
1331          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1332                                      get_alu_src(ctx, instr->src[0]),
1333                                      as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1334          VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1335          sub->neg[1] = true;
1336       } else {
1337          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1338          nir_print_instr(&instr->instr, stderr);
1339          fprintf(stderr, "\n");
1340       }
1341       break;
1342    }
1343    case nir_op_fmax: {
1344       if (dst.size() == 1) {
1345          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true);
1346       } else if (dst.size() == 2) {
1347          bld.vop3(aco_opcode::v_max_f64, Definition(dst),
1348                   get_alu_src(ctx, instr->src[0]),
1349                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1350       } else {
1351          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1352          nir_print_instr(&instr->instr, stderr);
1353          fprintf(stderr, "\n");
1354       }
1355       break;
1356    }
1357    case nir_op_fmin: {
1358       if (dst.size() == 1) {
1359          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true);
1360       } else if (dst.size() == 2) {
1361          bld.vop3(aco_opcode::v_min_f64, Definition(dst),
1362                   get_alu_src(ctx, instr->src[0]),
1363                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1364       } else {
1365          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1366          nir_print_instr(&instr->instr, stderr);
1367          fprintf(stderr, "\n");
1368       }
1369       break;
1370    }
1371    case nir_op_fmax3: {
1372       if (dst.size() == 1) {
1373          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst);
1374       } else {
1375          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1376          nir_print_instr(&instr->instr, stderr);
1377          fprintf(stderr, "\n");
1378       }
1379       break;
1380    }
1381    case nir_op_fmin3: {
1382       if (dst.size() == 1) {
1383          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst);
1384       } else {
1385          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1386          nir_print_instr(&instr->instr, stderr);
1387          fprintf(stderr, "\n");
1388       }
1389       break;
1390    }
1391    case nir_op_fmed3: {
1392       if (dst.size() == 1) {
1393          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst);
1394       } else {
1395          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1396          nir_print_instr(&instr->instr, stderr);
1397          fprintf(stderr, "\n");
1398       }
1399       break;
1400    }
1401    case nir_op_umax3: {
1402       if (dst.size() == 1) {
1403          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1404       } else {
1405          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1406          nir_print_instr(&instr->instr, stderr);
1407          fprintf(stderr, "\n");
1408       }
1409       break;
1410    }
1411    case nir_op_umin3: {
1412       if (dst.size() == 1) {
1413          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1414       } else {
1415          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1416          nir_print_instr(&instr->instr, stderr);
1417          fprintf(stderr, "\n");
1418       }
1419       break;
1420    }
1421    case nir_op_umed3: {
1422       if (dst.size() == 1) {
1423          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1424       } else {
1425          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1426          nir_print_instr(&instr->instr, stderr);
1427          fprintf(stderr, "\n");
1428       }
1429       break;
1430    }
1431    case nir_op_imax3: {
1432       if (dst.size() == 1) {
1433          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1434       } else {
1435          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1436          nir_print_instr(&instr->instr, stderr);
1437          fprintf(stderr, "\n");
1438       }
1439       break;
1440    }
1441    case nir_op_imin3: {
1442       if (dst.size() == 1) {
1443          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1444       } else {
1445          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1446          nir_print_instr(&instr->instr, stderr);
1447          fprintf(stderr, "\n");
1448       }
1449       break;
1450    }
1451    case nir_op_imed3: {
1452       if (dst.size() == 1) {
1453          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1454       } else {
1455          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1456          nir_print_instr(&instr->instr, stderr);
1457          fprintf(stderr, "\n");
1458       }
1459       break;
1460    }
1461    case nir_op_cube_face_coord: {
1462       Temp in = get_alu_src(ctx, instr->src[0], 3);
1463       Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1464                       emit_extract_vector(ctx, in, 1, v1),
1465                       emit_extract_vector(ctx, in, 2, v1) };
1466       Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1467       ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1468       Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1469       Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1470       sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1471       tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1472       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1473       break;
1474    }
1475    case nir_op_cube_face_index: {
1476       Temp in = get_alu_src(ctx, instr->src[0], 3);
1477       Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1478                       emit_extract_vector(ctx, in, 1, v1),
1479                       emit_extract_vector(ctx, in, 2, v1) };
1480       bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1481       break;
1482    }
1483    case nir_op_bcsel: {
1484       emit_bcsel(ctx, instr, dst);
1485       break;
1486    }
1487    case nir_op_frsq: {
1488       if (dst.size() == 1) {
1489          emit_rsq(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1490       } else if (dst.size() == 2) {
1491          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1492       } else {
1493          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1494          nir_print_instr(&instr->instr, stderr);
1495          fprintf(stderr, "\n");
1496       }
1497       break;
1498    }
1499    case nir_op_fneg: {
1500       Temp src = get_alu_src(ctx, instr->src[0]);
1501       if (dst.size() == 1) {
1502          if (ctx->block->fp_mode.must_flush_denorms32)
1503             src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1504          bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1505       } else if (dst.size() == 2) {
1506          if (ctx->block->fp_mode.must_flush_denorms16_64)
1507             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1508          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1509          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1510          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1511          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1512       } else {
1513          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1514          nir_print_instr(&instr->instr, stderr);
1515          fprintf(stderr, "\n");
1516       }
1517       break;
1518    }
1519    case nir_op_fabs: {
1520       Temp src = get_alu_src(ctx, instr->src[0]);
1521       if (dst.size() == 1) {
1522          if (ctx->block->fp_mode.must_flush_denorms32)
1523             src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1524          bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1525       } else if (dst.size() == 2) {
1526          if (ctx->block->fp_mode.must_flush_denorms16_64)
1527             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1528          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1529          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1530          upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1531          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1532       } else {
1533          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1534          nir_print_instr(&instr->instr, stderr);
1535          fprintf(stderr, "\n");
1536       }
1537       break;
1538    }
1539    case nir_op_fsat: {
1540       Temp src = get_alu_src(ctx, instr->src[0]);
1541       if (dst.size() == 1) {
1542          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1543       } else if (dst.size() == 2) {
1544          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
1545          VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
1546          vop3->clamp = true;
1547       } else {
1548          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1549          nir_print_instr(&instr->instr, stderr);
1550          fprintf(stderr, "\n");
1551       }
1552       break;
1553    }
1554    case nir_op_flog2: {
1555       if (dst.size() == 1) {
1556          emit_log2(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1557       } else {
1558          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1559          nir_print_instr(&instr->instr, stderr);
1560          fprintf(stderr, "\n");
1561       }
1562       break;
1563    }
1564    case nir_op_frcp: {
1565       if (dst.size() == 1) {
1566          emit_rcp(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1567       } else if (dst.size() == 2) {
1568          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
1569       } else {
1570          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1571          nir_print_instr(&instr->instr, stderr);
1572          fprintf(stderr, "\n");
1573       }
1574       break;
1575    }
1576    case nir_op_fexp2: {
1577       if (dst.size() == 1) {
1578          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
1579       } else {
1580          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1581          nir_print_instr(&instr->instr, stderr);
1582          fprintf(stderr, "\n");
1583       }
1584       break;
1585    }
1586    case nir_op_fsqrt: {
1587       if (dst.size() == 1) {
1588          emit_sqrt(ctx, bld, Definition(dst), get_alu_src(ctx, instr->src[0]));
1589       } else if (dst.size() == 2) {
1590          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
1591       } else {
1592          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1593          nir_print_instr(&instr->instr, stderr);
1594          fprintf(stderr, "\n");
1595       }
1596       break;
1597    }
1598    case nir_op_ffract: {
1599       if (dst.size() == 1) {
1600          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
1601       } else if (dst.size() == 2) {
1602          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
1603       } else {
1604          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1605          nir_print_instr(&instr->instr, stderr);
1606          fprintf(stderr, "\n");
1607       }
1608       break;
1609    }
1610    case nir_op_ffloor: {
1611       if (dst.size() == 1) {
1612          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
1613       } else if (dst.size() == 2) {
1614          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f64, dst);
1615       } else {
1616          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1617          nir_print_instr(&instr->instr, stderr);
1618          fprintf(stderr, "\n");
1619       }
1620       break;
1621    }
1622    case nir_op_fceil: {
1623       if (dst.size() == 1) {
1624          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
1625       } else if (dst.size() == 2) {
1626          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
1627       } else {
1628          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1629          nir_print_instr(&instr->instr, stderr);
1630          fprintf(stderr, "\n");
1631       }
1632       break;
1633    }
1634    case nir_op_ftrunc: {
1635       if (dst.size() == 1) {
1636          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
1637       } else if (dst.size() == 2) {
1638          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f64, dst);
1639       } else {
1640          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1641          nir_print_instr(&instr->instr, stderr);
1642          fprintf(stderr, "\n");
1643       }
1644       break;
1645    }
1646    case nir_op_fround_even: {
1647       if (dst.size() == 1) {
1648          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
1649       } else if (dst.size() == 2) {
1650          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
1651       } else {
1652          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1653          nir_print_instr(&instr->instr, stderr);
1654          fprintf(stderr, "\n");
1655       }
1656       break;
1657    }
1658    case nir_op_fsin:
1659    case nir_op_fcos: {
1660       Temp src = get_alu_src(ctx, instr->src[0]);
1661       aco_ptr<Instruction> norm;
1662       if (dst.size() == 1) {
1663          Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
1664          Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, as_vgpr(ctx, src));
1665
1666          /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
1667          if (ctx->options->chip_class < GFX9)
1668             tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
1669
1670          aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
1671          bld.vop1(opcode, Definition(dst), tmp);
1672       } else {
1673          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1674          nir_print_instr(&instr->instr, stderr);
1675          fprintf(stderr, "\n");
1676       }
1677       break;
1678    }
1679    case nir_op_ldexp: {
1680       if (dst.size() == 1) {
1681          bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
1682                   as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1683                   get_alu_src(ctx, instr->src[1]));
1684       } else if (dst.size() == 2) {
1685          bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
1686                   as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1687                   get_alu_src(ctx, instr->src[1]));
1688       } else {
1689          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1690          nir_print_instr(&instr->instr, stderr);
1691          fprintf(stderr, "\n");
1692       }
1693       break;
1694    }
1695    case nir_op_frexp_sig: {
1696       if (dst.size() == 1) {
1697          bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst),
1698                   get_alu_src(ctx, instr->src[0]));
1699       } else if (dst.size() == 2) {
1700          bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst),
1701                   get_alu_src(ctx, instr->src[0]));
1702       } else {
1703          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1704          nir_print_instr(&instr->instr, stderr);
1705          fprintf(stderr, "\n");
1706       }
1707       break;
1708    }
1709    case nir_op_frexp_exp: {
1710       if (instr->src[0].src.ssa->bit_size == 32) {
1711          bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst),
1712                   get_alu_src(ctx, instr->src[0]));
1713       } else if (instr->src[0].src.ssa->bit_size == 64) {
1714          bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst),
1715                   get_alu_src(ctx, instr->src[0]));
1716       } else {
1717          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1718          nir_print_instr(&instr->instr, stderr);
1719          fprintf(stderr, "\n");
1720       }
1721       break;
1722    }
1723    case nir_op_fsign: {
1724       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
1725       if (dst.size() == 1) {
1726          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1727          src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
1728          cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1729          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
1730       } else if (dst.size() == 2) {
1731          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1732          Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1733          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
1734
1735          cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1736          tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
1737          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
1738
1739          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1740       } else {
1741          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1742          nir_print_instr(&instr->instr, stderr);
1743          fprintf(stderr, "\n");
1744       }
1745       break;
1746    }
1747    case nir_op_f2f32: {
1748       if (instr->src[0].src.ssa->bit_size == 64) {
1749          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
1750       } else {
1751          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1752          nir_print_instr(&instr->instr, stderr);
1753          fprintf(stderr, "\n");
1754       }
1755       break;
1756    }
1757    case nir_op_f2f64: {
1758       if (instr->src[0].src.ssa->bit_size == 32) {
1759          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst);
1760       } else {
1761          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1762          nir_print_instr(&instr->instr, stderr);
1763          fprintf(stderr, "\n");
1764       }
1765       break;
1766    }
1767    case nir_op_i2f32: {
1768       assert(dst.size() == 1);
1769       emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst);
1770       break;
1771    }
1772    case nir_op_i2f64: {
1773       if (instr->src[0].src.ssa->bit_size == 32) {
1774          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst);
1775       } else if (instr->src[0].src.ssa->bit_size == 64) {
1776          Temp src = get_alu_src(ctx, instr->src[0]);
1777          RegClass rc = RegClass(src.type(), 1);
1778          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1779          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1780          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1781          upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
1782          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1783          bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1784
1785       } else {
1786          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1787          nir_print_instr(&instr->instr, stderr);
1788          fprintf(stderr, "\n");
1789       }
1790       break;
1791    }
1792    case nir_op_u2f32: {
1793       assert(dst.size() == 1);
1794       emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst);
1795       break;
1796    }
1797    case nir_op_u2f64: {
1798       if (instr->src[0].src.ssa->bit_size == 32) {
1799          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst);
1800       } else if (instr->src[0].src.ssa->bit_size == 64) {
1801          Temp src = get_alu_src(ctx, instr->src[0]);
1802          RegClass rc = RegClass(src.type(), 1);
1803          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1804          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1805          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1806          upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
1807          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1808          bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1809       } else {
1810          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1811          nir_print_instr(&instr->instr, stderr);
1812          fprintf(stderr, "\n");
1813       }
1814       break;
1815    }
1816    case nir_op_f2i32: {
1817       Temp src = get_alu_src(ctx, instr->src[0]);
1818       if (instr->src[0].src.ssa->bit_size == 32) {
1819          if (dst.type() == RegType::vgpr)
1820             bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
1821          else
1822             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1823                        bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
1824
1825       } else if (instr->src[0].src.ssa->bit_size == 64) {
1826          if (dst.type() == RegType::vgpr)
1827             bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
1828          else
1829             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1830                        bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
1831
1832       } else {
1833          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1834          nir_print_instr(&instr->instr, stderr);
1835          fprintf(stderr, "\n");
1836       }
1837       break;
1838    }
1839    case nir_op_f2u32: {
1840       Temp src = get_alu_src(ctx, instr->src[0]);
1841       if (instr->src[0].src.ssa->bit_size == 32) {
1842          if (dst.type() == RegType::vgpr)
1843             bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
1844          else
1845             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1846                        bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
1847
1848       } else if (instr->src[0].src.ssa->bit_size == 64) {
1849          if (dst.type() == RegType::vgpr)
1850             bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
1851          else
1852             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1853                        bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
1854
1855       } else {
1856          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1857          nir_print_instr(&instr->instr, stderr);
1858          fprintf(stderr, "\n");
1859       }
1860       break;
1861    }
1862    case nir_op_f2i64: {
1863       Temp src = get_alu_src(ctx, instr->src[0]);
1864       if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
1865          Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1866          exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
1867          Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1868          Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
1869          mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1870          mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
1871          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1872          Temp new_exponent = bld.tmp(v1);
1873          Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
1874          if (ctx->program->chip_class >= GFX8)
1875             mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
1876          else
1877             mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
1878          Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
1879          Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1880          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1881          lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
1882          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
1883          lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
1884          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
1885          Temp new_lower = bld.tmp(v1);
1886          borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
1887          Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
1888          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
1889
1890       } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
1891          if (src.type() == RegType::vgpr)
1892             src = bld.as_uniform(src);
1893          Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1894          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1895          exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1896          exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
1897          Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1898          Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
1899          mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1900          mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
1901          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1902          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
1903          mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
1904          Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
1905          Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
1906          mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
1907          Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1908          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1909          lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
1910          upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
1911          Temp borrow = bld.tmp(s1);
1912          lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
1913          upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
1914          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1915
1916       } else if (instr->src[0].src.ssa->bit_size == 64) {
1917          Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1918          Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1919          Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1920          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1921          Temp floor  = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1922          Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1923          Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1924          Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
1925          if (dst.type() == RegType::sgpr) {
1926             lower = bld.as_uniform(lower);
1927             upper = bld.as_uniform(upper);
1928          }
1929          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1930
1931       } else {
1932          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1933          nir_print_instr(&instr->instr, stderr);
1934          fprintf(stderr, "\n");
1935       }
1936       break;
1937    }
1938    case nir_op_f2u64: {
1939       Temp src = get_alu_src(ctx, instr->src[0]);
1940       if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
1941          Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1942          Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent);
1943          exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
1944          Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1945          mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1946          Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
1947          Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
1948          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1949          Temp new_exponent = bld.tmp(v1);
1950          Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
1951          if (ctx->program->chip_class >= GFX8)
1952             mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
1953          else
1954             mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
1955          Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1956          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1957          lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
1958          upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
1959          lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
1960          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
1961          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1962
1963       } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
1964          if (src.type() == RegType::vgpr)
1965             src = bld.as_uniform(src);
1966          Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1967          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1968          exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1969          Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1970          mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1971          Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
1972          Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
1973          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1974          Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
1975          mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
1976          Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
1977          mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
1978          Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1979          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1980          Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
1981          lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
1982          upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
1983          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1984
1985       } else if (instr->src[0].src.ssa->bit_size == 64) {
1986          Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1987          Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1988          Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1989          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1990          Temp floor  = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1991          Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1992          Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1993          Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
1994          if (dst.type() == RegType::sgpr) {
1995             lower = bld.as_uniform(lower);
1996             upper = bld.as_uniform(upper);
1997          }
1998          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1999
2000       } else {
2001          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2002          nir_print_instr(&instr->instr, stderr);
2003          fprintf(stderr, "\n");
2004       }
2005       break;
2006    }
2007    case nir_op_b2f32: {
2008       Temp src = get_alu_src(ctx, instr->src[0]);
2009       assert(src.regClass() == bld.lm);
2010
2011       if (dst.regClass() == s1) {
2012          src = bool_to_scalar_condition(ctx, src);
2013          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
2014       } else if (dst.regClass() == v1) {
2015          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
2016       } else {
2017          unreachable("Wrong destination register class for nir_op_b2f32.");
2018       }
2019       break;
2020    }
2021    case nir_op_b2f64: {
2022       Temp src = get_alu_src(ctx, instr->src[0]);
2023       assert(src.regClass() == bld.lm);
2024
2025       if (dst.regClass() == s2) {
2026          src = bool_to_scalar_condition(ctx, src);
2027          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
2028       } else if (dst.regClass() == v2) {
2029          Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v2), Operand(0x3FF00000u));
2030          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
2031          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2032       } else {
2033          unreachable("Wrong destination register class for nir_op_b2f64.");
2034       }
2035       break;
2036    }
2037    case nir_op_i2i32: {
2038       Temp src = get_alu_src(ctx, instr->src[0]);
2039       if (instr->src[0].src.ssa->bit_size == 64) {
2040          /* we can actually just say dst = src, as it would map the lower register */
2041          emit_extract_vector(ctx, src, 0, dst);
2042       } else {
2043          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2044          nir_print_instr(&instr->instr, stderr);
2045          fprintf(stderr, "\n");
2046       }
2047       break;
2048    }
2049    case nir_op_u2u32: {
2050       Temp src = get_alu_src(ctx, instr->src[0]);
2051       if (instr->src[0].src.ssa->bit_size == 16) {
2052          if (dst.regClass() == s1) {
2053             bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
2054          } else {
2055             // TODO: do better with SDWA
2056             bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src);
2057          }
2058       } else if (instr->src[0].src.ssa->bit_size == 64) {
2059          /* we can actually just say dst = src, as it would map the lower register */
2060          emit_extract_vector(ctx, src, 0, dst);
2061       } else {
2062          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2063          nir_print_instr(&instr->instr, stderr);
2064          fprintf(stderr, "\n");
2065       }
2066       break;
2067    }
2068    case nir_op_i2i64: {
2069       Temp src = get_alu_src(ctx, instr->src[0]);
2070       if (src.regClass() == s1) {
2071          Temp high = bld.sopc(aco_opcode::s_ashr_i32, bld.def(s1, scc), src, Operand(31u));
2072          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
2073       } else if (src.regClass() == v1) {
2074          Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
2075          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
2076       } else {
2077          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2078          nir_print_instr(&instr->instr, stderr);
2079          fprintf(stderr, "\n");
2080       }
2081       break;
2082    }
2083    case nir_op_u2u64: {
2084       Temp src = get_alu_src(ctx, instr->src[0]);
2085       if (instr->src[0].src.ssa->bit_size == 32) {
2086          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
2087       } else {
2088          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2089          nir_print_instr(&instr->instr, stderr);
2090          fprintf(stderr, "\n");
2091       }
2092       break;
2093    }
2094    case nir_op_b2i32: {
2095       Temp src = get_alu_src(ctx, instr->src[0]);
2096       assert(src.regClass() == bld.lm);
2097
2098       if (dst.regClass() == s1) {
2099          // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
2100          bool_to_scalar_condition(ctx, src, dst);
2101       } else if (dst.regClass() == v1) {
2102          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
2103       } else {
2104          unreachable("Invalid register class for b2i32");
2105       }
2106       break;
2107    }
2108    case nir_op_i2b1: {
2109       Temp src = get_alu_src(ctx, instr->src[0]);
2110       assert(dst.regClass() == bld.lm);
2111
2112       if (src.type() == RegType::vgpr) {
2113          assert(src.regClass() == v1 || src.regClass() == v2);
2114          bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
2115                   Definition(dst), Operand(0u), src).def(0).setHint(vcc);
2116       } else {
2117          assert(src.regClass() == s1 || src.regClass() == s2);
2118          Temp tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
2119                              bld.scc(bld.def(s1)), Operand(0u), src);
2120          bool_to_vector_condition(ctx, tmp, dst);
2121       }
2122       break;
2123    }
2124    case nir_op_pack_64_2x32_split: {
2125       Temp src0 = get_alu_src(ctx, instr->src[0]);
2126       Temp src1 = get_alu_src(ctx, instr->src[1]);
2127
2128       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2129       break;
2130    }
2131    case nir_op_unpack_64_2x32_split_x:
2132       bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2133       break;
2134    case nir_op_unpack_64_2x32_split_y:
2135       bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2136       break;
2137    case nir_op_pack_half_2x16: {
2138       Temp src = get_alu_src(ctx, instr->src[0], 2);
2139
2140       if (dst.regClass() == v1) {
2141          Temp src0 = bld.tmp(v1);
2142          Temp src1 = bld.tmp(v1);
2143          bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2144          if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz)
2145             bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2146          else
2147             bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst),
2148                      bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0),
2149                      bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1));
2150       } else {
2151          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2152          nir_print_instr(&instr->instr, stderr);
2153          fprintf(stderr, "\n");
2154       }
2155       break;
2156    }
2157    case nir_op_unpack_half_2x16_split_x: {
2158       if (dst.regClass() == v1) {
2159          Builder bld(ctx->program, ctx->block);
2160          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2161       } else {
2162          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2163          nir_print_instr(&instr->instr, stderr);
2164          fprintf(stderr, "\n");
2165       }
2166       break;
2167    }
2168    case nir_op_unpack_half_2x16_split_y: {
2169       if (dst.regClass() == v1) {
2170          Builder bld(ctx->program, ctx->block);
2171          /* TODO: use SDWA here */
2172          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2173                   bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2174       } else {
2175          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2176          nir_print_instr(&instr->instr, stderr);
2177          fprintf(stderr, "\n");
2178       }
2179       break;
2180    }
2181    case nir_op_fquantize2f16: {
2182       Temp src = get_alu_src(ctx, instr->src[0]);
2183       Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
2184       Temp f32, cmp_res;
2185
2186       if (ctx->program->chip_class >= GFX8) {
2187          Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */
2188          cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);
2189          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2190       } else {
2191          /* 0x38800000 is smallest half float value (2^-14) in 32-bit float,
2192           * so compare the result and flush to 0 if it's smaller.
2193           */
2194          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2195          Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u));
2196          Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), f32, smallest);
2197          static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
2198          cmp_res = vop3->definitions[0].getTemp();
2199       }
2200
2201       if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32 || ctx->program->chip_class < GFX8) {
2202          Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src));
2203          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
2204       } else {
2205          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2206       }
2207       break;
2208    }
2209    case nir_op_bfm: {
2210       Temp bits = get_alu_src(ctx, instr->src[0]);
2211       Temp offset = get_alu_src(ctx, instr->src[1]);
2212
2213       if (dst.regClass() == s1) {
2214          bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2215       } else if (dst.regClass() == v1) {
2216          bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2217       } else {
2218          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2219          nir_print_instr(&instr->instr, stderr);
2220          fprintf(stderr, "\n");
2221       }
2222       break;
2223    }
2224    case nir_op_bitfield_select: {
2225       /* (mask & insert) | (~mask & base) */
2226       Temp bitmask = get_alu_src(ctx, instr->src[0]);
2227       Temp insert = get_alu_src(ctx, instr->src[1]);
2228       Temp base = get_alu_src(ctx, instr->src[2]);
2229
2230       /* dst = (insert & bitmask) | (base & ~bitmask) */
2231       if (dst.regClass() == s1) {
2232          aco_ptr<Instruction> sop2;
2233          nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2234          nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2235          Operand lhs;
2236          if (const_insert && const_bitmask) {
2237             lhs = Operand(const_insert->u32 & const_bitmask->u32);
2238          } else {
2239             insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2240             lhs = Operand(insert);
2241          }
2242
2243          Operand rhs;
2244          nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2245          if (const_base && const_bitmask) {
2246             rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2247          } else {
2248             base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2249             rhs = Operand(base);
2250          }
2251
2252          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2253
2254       } else if (dst.regClass() == v1) {
2255          if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2256             base = as_vgpr(ctx, base);
2257          if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2258             insert = as_vgpr(ctx, insert);
2259
2260          bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2261
2262       } else {
2263          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2264          nir_print_instr(&instr->instr, stderr);
2265          fprintf(stderr, "\n");
2266       }
2267       break;
2268    }
2269    case nir_op_ubfe:
2270    case nir_op_ibfe: {
2271       Temp base = get_alu_src(ctx, instr->src[0]);
2272       Temp offset = get_alu_src(ctx, instr->src[1]);
2273       Temp bits = get_alu_src(ctx, instr->src[2]);
2274
2275       if (dst.type() == RegType::sgpr) {
2276          Operand extract;
2277          nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2278          nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2279          if (const_offset && const_bits) {
2280             uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2281             extract = Operand(const_extract);
2282          } else {
2283             Operand width;
2284             if (const_bits) {
2285                width = Operand(const_bits->u32 << 16);
2286             } else {
2287                width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2288             }
2289             extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2290          }
2291
2292          aco_opcode opcode;
2293          if (dst.regClass() == s1) {
2294             if (instr->op == nir_op_ubfe)
2295                opcode = aco_opcode::s_bfe_u32;
2296             else
2297                opcode = aco_opcode::s_bfe_i32;
2298          } else if (dst.regClass() == s2) {
2299             if (instr->op == nir_op_ubfe)
2300                opcode = aco_opcode::s_bfe_u64;
2301             else
2302                opcode = aco_opcode::s_bfe_i64;
2303          } else {
2304             unreachable("Unsupported BFE bit size");
2305          }
2306
2307          bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2308
2309       } else {
2310          aco_opcode opcode;
2311          if (dst.regClass() == v1) {
2312             if (instr->op == nir_op_ubfe)
2313                opcode = aco_opcode::v_bfe_u32;
2314             else
2315                opcode = aco_opcode::v_bfe_i32;
2316          } else {
2317             unreachable("Unsupported BFE bit size");
2318          }
2319
2320          emit_vop3a_instruction(ctx, instr, opcode, dst);
2321       }
2322       break;
2323    }
2324    case nir_op_bit_count: {
2325       Temp src = get_alu_src(ctx, instr->src[0]);
2326       if (src.regClass() == s1) {
2327          bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2328       } else if (src.regClass() == v1) {
2329          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2330       } else if (src.regClass() == v2) {
2331          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2332                   emit_extract_vector(ctx, src, 1, v1),
2333                   bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2334                            emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2335       } else if (src.regClass() == s2) {
2336          bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2337       } else {
2338          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2339          nir_print_instr(&instr->instr, stderr);
2340          fprintf(stderr, "\n");
2341       }
2342       break;
2343    }
2344    case nir_op_flt: {
2345       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64);
2346       break;
2347    }
2348    case nir_op_fge: {
2349       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64);
2350       break;
2351    }
2352    case nir_op_feq: {
2353       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64);
2354       break;
2355    }
2356    case nir_op_fne: {
2357       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64);
2358       break;
2359    }
2360    case nir_op_ilt: {
2361       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
2362       break;
2363    }
2364    case nir_op_ige: {
2365       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
2366       break;
2367    }
2368    case nir_op_ieq: {
2369       if (instr->src[0].src.ssa->bit_size == 1)
2370          emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
2371       else
2372          emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, aco_opcode::s_cmp_eq_u64);
2373       break;
2374    }
2375    case nir_op_ine: {
2376       if (instr->src[0].src.ssa->bit_size == 1)
2377          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
2378       else
2379          emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, aco_opcode::s_cmp_lg_u64);
2380       break;
2381    }
2382    case nir_op_ult: {
2383       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
2384       break;
2385    }
2386    case nir_op_uge: {
2387       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
2388       break;
2389    }
2390    case nir_op_fddx:
2391    case nir_op_fddy:
2392    case nir_op_fddx_fine:
2393    case nir_op_fddy_fine:
2394    case nir_op_fddx_coarse:
2395    case nir_op_fddy_coarse: {
2396       Temp src = get_alu_src(ctx, instr->src[0]);
2397       uint16_t dpp_ctrl1, dpp_ctrl2;
2398       if (instr->op == nir_op_fddx_fine) {
2399          dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
2400          dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
2401       } else if (instr->op == nir_op_fddy_fine) {
2402          dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
2403          dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
2404       } else {
2405          dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
2406          if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
2407             dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
2408          else
2409             dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
2410       }
2411
2412       Temp tmp;
2413       if (ctx->program->chip_class >= GFX8) {
2414          Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
2415          tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
2416       } else {
2417          Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
2418          Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
2419          tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
2420       }
2421       emit_wqm(ctx, tmp, dst, true);
2422       break;
2423    }
2424    default:
2425       fprintf(stderr, "Unknown NIR ALU instr: ");
2426       nir_print_instr(&instr->instr, stderr);
2427       fprintf(stderr, "\n");
2428    }
2429 }
2430
2431 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
2432 {
2433    Temp dst = get_ssa_temp(ctx, &instr->def);
2434
2435    // TODO: we really want to have the resulting type as this would allow for 64bit literals
2436    // which get truncated the lsb if double and msb if int
2437    // for now, we only use s_mov_b64 with 64bit inline constants
2438    assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
2439    assert(dst.type() == RegType::sgpr);
2440
2441    Builder bld(ctx->program, ctx->block);
2442
2443    if (instr->def.bit_size == 1) {
2444       assert(dst.regClass() == bld.lm);
2445       int val = instr->value[0].b ? -1 : 0;
2446       Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val);
2447       bld.sop1(Builder::s_mov, Definition(dst), op);
2448    } else if (dst.size() == 1) {
2449       bld.copy(Definition(dst), Operand(instr->value[0].u32));
2450    } else {
2451       assert(dst.size() != 1);
2452       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2453       if (instr->def.bit_size == 64)
2454          for (unsigned i = 0; i < dst.size(); i++)
2455             vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
2456       else {
2457          for (unsigned i = 0; i < dst.size(); i++)
2458             vec->operands[i] = Operand{instr->value[i].u32};
2459       }
2460       vec->definitions[0] = Definition(dst);
2461       ctx->block->instructions.emplace_back(std::move(vec));
2462    }
2463 }
2464
/* Widens a bit mask: every set bit i of `mask` becomes a run of `multiplier`
 * consecutive set bits starting at bit i * multiplier of the result.
 *
 * Example: widen_mask(0b101, 2) == 0b110011.
 *
 * Runs (or parts of runs) that would land at or above bit 32 are discarded,
 * and a `multiplier` of 32 or more saturates the run to a full 32-bit word.
 * This keeps every shift amount below the width of uint32_t, which the
 * language requires for the shift to be defined.
 */
uint32_t widen_mask(uint32_t mask, unsigned multiplier)
{
   /* A run of `multiplier` ones; (1u << 32) would be undefined, so saturate. */
   const uint32_t group = multiplier >= 32 ? 0xffffffffu : (1u << multiplier) - 1u;

   uint32_t new_mask = 0;
   for (unsigned i = 0; i < 32 && (mask >> i) != 0; ++i) {
      /* Computed in 64 bits so that a huge multiplier cannot wrap around. */
      const uint64_t shift = (uint64_t)i * multiplier;
      if (shift >= 32)
         break; /* this run and all following ones fall outside the result */
      if (mask & (1u << i))
         new_mask |= group << shift;
   }
   return new_mask;
}
2473
2474 void visit_store_vs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2475 {
2476    /* This wouldn't work inside control flow or with indirect offsets but
2477     * that doesn't happen because of nir_lower_io_to_temporaries(). */
2478
2479    unsigned write_mask = nir_intrinsic_write_mask(instr);
2480    unsigned component = nir_intrinsic_component(instr);
2481    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2482    unsigned idx = nir_intrinsic_base(instr) + component;
2483
2484    nir_instr *off_instr = instr->src[1].ssa->parent_instr;
2485    if (off_instr->type != nir_instr_type_load_const) {
2486       fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2487       nir_print_instr(off_instr, stderr);
2488       fprintf(stderr, "\n");
2489    }
2490    idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
2491
2492    if (instr->src[0].ssa->bit_size == 64)
2493       write_mask = widen_mask(write_mask, 2);
2494
2495    for (unsigned i = 0; i < 8; ++i) {
2496       if (write_mask & (1 << i)) {
2497          ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u);
2498          ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
2499       }
2500       idx++;
2501    }
2502 }
2503
/* Emits the export (`exp`) instruction for a fragment shader output store.
 *
 * src[0] is the value to export and src[1] the offset, which must be a
 * constant. The output location is nir_intrinsic_base(instr) / 4 plus that
 * offset:
 *  - FRAG_RESULT_SAMPLE_MASK, FRAG_RESULT_DEPTH and FRAG_RESULT_STENCIL are
 *    exported to V_008DFC_SQ_EXP_MRTZ,
 *  - every other location is taken relative to FRAG_RESULT_DATA0 and
 *    exported to V_008DFC_SQ_EXP_MRT + index, using the channel mask and
 *    optional packing selected by its 4-bit entry in key.fs.col_format.
 */
void visit_store_fs_output(isel_context *ctx, nir_intrinsic_instr *instr)
{
   Builder bld(ctx->program, ctx->block);
   unsigned write_mask = nir_intrinsic_write_mask(instr);
   Operand values[4];
   Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
   /* One operand per channel: the extracted component when the channel is in
    * the write mask, otherwise an operand built from the register class only
    * (the packing code further down tests those with isUndefined()). */
   for (unsigned i = 0; i < 4; ++i) {
      if (write_mask & (1 << i)) {
         Temp tmp = emit_extract_vector(ctx, src, i, v1);
         values[i] = Operand(tmp);
      } else {
         values[i] = Operand(v1);
      }
   }

   unsigned index = nir_intrinsic_base(instr) / 4;
   unsigned target, col_format;
   unsigned enabled_channels = 0xF;
   /* Stays 0 unless the color format requires packing two channels per dword. */
   aco_opcode compr_op = (aco_opcode)0;

   nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
   assert(offset && "Non-const offsets on exports not yet supported");
   index += offset->u32;

   assert(index != FRAG_RESULT_COLOR);

   /* Unlike vertex shader exports, it's fine to use multiple exports to
    * export separate channels of one target. So shaders which export both
    * FRAG_RESULT_SAMPLE_MASK and FRAG_RESULT_DEPTH should work fine.
    * TODO: combine the exports in those cases and create better code
    */

   /* In the MRTZ branches below col_format is set to (unsigned) -1, which is
    * intended to match none of the cases of the col_format switch, so the
    * channel mask chosen here is kept. */
   if (index == FRAG_RESULT_SAMPLE_MASK) {

      if (ctx->program->info->ps.writes_z) {
         /* Move the value to channel 2 and enable only that channel. */
         target = V_008DFC_SQ_EXP_MRTZ;
         enabled_channels = 0x4;
         col_format = (unsigned) -1;

         values[2] = values[0];
         values[0] = Operand(v1);
      } else {
         /* Exported on its own: value in the second operand, mask 0xc. The
          * trailing `true` is in the position that receives
          * `(bool) compr_op` in the final export of this function. */
         bld.exp(aco_opcode::exp, Operand(v1), Operand(values[0]), Operand(v1), Operand(v1),
                 0xc, V_008DFC_SQ_EXP_MRTZ, true);
         return;
      }

   } else if (index == FRAG_RESULT_DEPTH) {

      /* Depth stays in channel 0. */
      target = V_008DFC_SQ_EXP_MRTZ;
      enabled_channels = 0x1;
      col_format = (unsigned) -1;

   } else if (index == FRAG_RESULT_STENCIL) {

      if (ctx->program->info->ps.writes_z) {
         /* Move the value to channel 1 and enable only that channel. */
         target = V_008DFC_SQ_EXP_MRTZ;
         enabled_channels = 0x2;
         col_format = (unsigned) -1;

         values[1] = values[0];
         values[0] = Operand(v1);
      } else {
         /* Exported on its own: the value is shifted left by 16 bits and sent
          * in the first operand with mask 0x3 and the same trailing `true`
          * as the sample mask case above. */
         values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), values[0]);
         bld.exp(aco_opcode::exp, values[0], Operand(v1), Operand(v1), Operand(v1),
                 0x3, V_008DFC_SQ_EXP_MRTZ, true);
         return;
      }

   } else {
      /* Any other location: index becomes relative to FRAG_RESULT_DATA0 and
       * selects both the MRT target and the 4-bit col_format entry. */
      index -= FRAG_RESULT_DATA0;
      target = V_008DFC_SQ_EXP_MRT + index;
      col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
   }
   /* Per-index bits of the shader key; only read by the UINT16_ABGR and
    * SINT16_ABGR cases below. */
   bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
   bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1;

   /* Select the channel mask and, for the 16-bit formats, the opcode used to
    * pack two 32-bit channels into one dword. */
   switch (col_format)
   {
   case V_028714_SPI_SHADER_ZERO:
      /* Nothing is exported: the NULL target makes the function return below. */
      enabled_channels = 0; /* writemask */
      target = V_008DFC_SQ_EXP_NULL;
      break;

   case V_028714_SPI_SHADER_32_R:
      enabled_channels = 1;
      break;

   case V_028714_SPI_SHADER_32_GR:
      enabled_channels = 0x3;
      break;

   case V_028714_SPI_SHADER_32_AR:
      if (ctx->options->chip_class >= GFX10) {
         /* Special case: on GFX10, the outputs are different for 32_AR */
         enabled_channels = 0x3;
         values[1] = values[3];
         values[3] = Operand(v1);
      } else {
         /* Channels 0 and 3. */
         enabled_channels = 0x9;
      }
      break;

   case V_028714_SPI_SHADER_FP16_ABGR:
      enabled_channels = 0x5;
      compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
      break;

   case V_028714_SPI_SHADER_UNORM16_ABGR:
      enabled_channels = 0x5;
      compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
      break;

   case V_028714_SPI_SHADER_SNORM16_ABGR:
      enabled_channels = 0x5;
      compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
      break;

   case V_028714_SPI_SHADER_UINT16_ABGR: {
      enabled_channels = 0x5;
      compr_op = aco_opcode::v_cvt_pk_u16_u32;
      if (is_int8 || is_int10) {
         /* clamp */
         /* Upper bound is 255 for int8 and 1023 for int10; with int10,
          * channel 3 is clamped to 3 instead (presumably a 2-bit alpha of a
          * 10:10:10:2 target - TODO confirm). */
         uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
         Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));

         for (unsigned i = 0; i < 4; i++) {
            if ((write_mask >> i) & 1) {
               values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
                                    i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val),
                                    values[i]);
            }
         }
      }
      break;
   }

   case V_028714_SPI_SHADER_SINT16_ABGR:
      enabled_channels = 0x5;
      compr_op = aco_opcode::v_cvt_pk_i16_i32;
      if (is_int8 || is_int10) {
         /* clamp */
         /* Range is [-128, 127] for int8 and [-512, 511] for int10; with
          * int10, channel 3 is clamped to [-2, 1] instead. The negative
          * bounds are stored in unsigned variables and consumed by the
          * signed min/max instructions below. */
         uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
         uint32_t min_rgb = is_int8 ? -128 :is_int10 ? -512 : 0;
         Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
         Temp min_rgb_val = bld.copy(bld.def(s1), Operand(min_rgb));

         for (unsigned i = 0; i < 4; i++) {
            if ((write_mask >> i) & 1) {
               values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
                                    i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val),
                                    values[i]);
               values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
                                    i == 3 && is_int10 ? Operand(-2u) : Operand(min_rgb_val),
                                    values[i]);
            }
         }
      }
      break;

   case V_028714_SPI_SHADER_32_ABGR:
      enabled_channels = 0xF;
      break;

   default:
      break;
   }

   if (target == V_008DFC_SQ_EXP_NULL)
      return;

   if ((bool) compr_op) {
      /* Packed formats: channel pairs (0,1) and (2,3) are combined with
       * compr_op into values[0] and values[1]; the remaining two operands
       * are left undefined. */
      for (int i = 0; i < 2; i++) {
         /* check if at least one of the values to be compressed is enabled */
         unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
         if (enabled) {
            enabled_channels |= enabled << (i*2);
            /* An undefined half of the pair is replaced by constant 0. */
            values[i] = bld.vop3(compr_op, bld.def(v1),
                                 values[i*2].isUndefined() ? Operand(0u) : values[i*2],
                                 values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]);
         } else {
            values[i] = Operand(v1);
         }
      }
      values[2] = Operand(v1);
      values[3] = Operand(v1);
   } else {
      /* Unpacked formats: operands of channels that are not enabled are
       * replaced by undefined ones. */
      for (int i = 0; i < 4; i++)
         values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
   }

   bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
           enabled_channels, target, (bool) compr_op);
}
2698
2699 Operand load_lds_size_m0(isel_context *ctx)
2700 {
2701    /* TODO: m0 does not need to be initialized on GFX9+ */
2702    Builder bld(ctx->program, ctx->block);
2703    return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
2704 }
2705
2706 void load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
2707               Temp address, unsigned base_offset, unsigned align)
2708 {
2709    assert(util_is_power_of_two_nonzero(align) && align >= 4);
2710
2711    Builder bld(ctx->program, ctx->block);
2712
2713    Operand m = load_lds_size_m0(ctx);
2714
2715    unsigned num_components = dst.size() * 4u / elem_size_bytes;
2716    unsigned bytes_read = 0;
2717    unsigned result_size = 0;
2718    unsigned total_bytes = num_components * elem_size_bytes;
2719    std::array<Temp, 4> result;
2720
2721    while (bytes_read < total_bytes) {
2722       unsigned todo = total_bytes - bytes_read;
2723       bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0;
2724       bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0;
2725
2726       aco_opcode op = aco_opcode::last_opcode;
2727       bool read2 = false;
2728       if (todo >= 16 && aligned16) {
2729          op = aco_opcode::ds_read_b128;
2730          todo = 16;
2731       } else if (todo >= 16 && aligned8) {
2732          op = aco_opcode::ds_read2_b64;
2733          read2 = true;
2734          todo = 16;
2735       } else if (todo >= 12 && aligned16) {
2736          op = aco_opcode::ds_read_b96;
2737          todo = 12;
2738       } else if (todo >= 8 && aligned8) {
2739          op = aco_opcode::ds_read_b64;
2740          todo = 8;
2741       } else if (todo >= 8) {
2742          op = aco_opcode::ds_read2_b32;
2743          read2 = true;
2744          todo = 8;
2745       } else if (todo >= 4) {
2746          op = aco_opcode::ds_read_b32;
2747          todo = 4;
2748       } else {
2749          assert(false);
2750       }
2751       assert(todo % elem_size_bytes == 0);
2752       unsigned num_elements = todo / elem_size_bytes;
2753       unsigned offset = base_offset + bytes_read;
2754       unsigned max_offset = read2 ? 1019 : 65535;
2755
2756       Temp address_offset = address;
2757       if (offset > max_offset) {
2758          address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
2759          offset = bytes_read;
2760       }
2761       assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */
2762
2763       Temp res;
2764       if (num_components == 1 && dst.type() == RegType::vgpr)
2765          res = dst;
2766       else
2767          res = bld.tmp(RegClass(RegType::vgpr, todo / 4));
2768
2769       if (read2)
2770          res = bld.ds(op, Definition(res), address_offset, m, offset >> 2, (offset >> 2) + 1);
2771       else
2772          res = bld.ds(op, Definition(res), address_offset, m, offset);
2773
2774       if (num_components == 1) {
2775          assert(todo == total_bytes);
2776          if (dst.type() == RegType::sgpr)
2777             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
2778          return;
2779       }
2780
2781       if (dst.type() == RegType::sgpr)
2782          res = bld.as_uniform(res);
2783
2784       if (num_elements == 1) {
2785          result[result_size++] = res;
2786       } else {
2787          assert(res != dst && res.size() % num_elements == 0);
2788          aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)};
2789          split->operands[0] = Operand(res);
2790          for (unsigned i = 0; i < num_elements; i++)
2791             split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4));
2792          ctx->block->instructions.emplace_back(std::move(split));
2793       }
2794
2795       bytes_read += todo;
2796    }
2797
2798    assert(result_size == num_components && result_size > 1);
2799    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)};
2800    for (unsigned i = 0; i < result_size; i++)
2801       vec->operands[i] = Operand(result[i]);
2802    vec->definitions[0] = Definition(dst);
2803    ctx->block->instructions.emplace_back(std::move(vec));
2804    ctx->allocated_vec.emplace(dst.id(), result);
2805 }
2806
2807 Temp extract_subvector(isel_context *ctx, Temp data, unsigned start, unsigned size, RegType type)
2808 {
2809    if (start == 0 && size == data.size())
2810       return type == RegType::vgpr ? as_vgpr(ctx, data) : data;
2811
2812    unsigned size_hint = 1;
2813    auto it = ctx->allocated_vec.find(data.id());
2814    if (it != ctx->allocated_vec.end())
2815       size_hint = it->second[0].size();
2816    if (size % size_hint || start % size_hint)
2817       size_hint = 1;
2818
2819    start /= size_hint;
2820    size /= size_hint;
2821
2822    Temp elems[size];
2823    for (unsigned i = 0; i < size; i++)
2824       elems[i] = emit_extract_vector(ctx, data, start + i, RegClass(type, size_hint));
2825
2826    if (size == 1)
2827       return type == RegType::vgpr ? as_vgpr(ctx, elems[0]) : elems[0];
2828
2829    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
2830    for (unsigned i = 0; i < size; i++)
2831       vec->operands[i] = Operand(elems[i]);
2832    Temp res = {ctx->program->allocateId(), RegClass(type, size * size_hint)};
2833    vec->definitions[0] = Definition(res);
2834    ctx->block->instructions.emplace_back(std::move(vec));
2835    return res;
2836 }
2837
2838 void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned data_start, unsigned total_size, unsigned offset0, unsigned offset1, unsigned align)
2839 {
2840    Builder bld(ctx->program, ctx->block);
2841    unsigned bytes_written = 0;
2842    while (bytes_written < total_size * 4) {
2843       unsigned todo = total_size * 4 - bytes_written;
2844       bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
2845       bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
2846
2847       aco_opcode op = aco_opcode::last_opcode;
2848       bool write2 = false;
2849       unsigned size = 0;
2850       if (todo >= 16 && aligned16) {
2851          op = aco_opcode::ds_write_b128;
2852          size = 4;
2853       } else if (todo >= 16 && aligned8) {
2854          op = aco_opcode::ds_write2_b64;
2855          write2 = true;
2856          size = 4;
2857       } else if (todo >= 12 && aligned16) {
2858          op = aco_opcode::ds_write_b96;
2859          size = 3;
2860       } else if (todo >= 8 && aligned8) {
2861          op = aco_opcode::ds_write_b64;
2862          size = 2;
2863       } else if (todo >= 8) {
2864          op = aco_opcode::ds_write2_b32;
2865          write2 = true;
2866          size = 2;
2867       } else if (todo >= 4) {
2868          op = aco_opcode::ds_write_b32;
2869          size = 1;
2870       } else {
2871          assert(false);
2872       }
2873
2874       unsigned offset = offset0 + offset1 + bytes_written;
2875       unsigned max_offset = write2 ? 1020 : 65535;
2876       Temp address_offset = address;
2877       if (offset > max_offset) {
2878          address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
2879          offset = offset1 + bytes_written;
2880       }
2881       assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
2882
2883       if (write2) {
2884          Temp val0 = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size / 2, RegType::vgpr);
2885          Temp val1 = extract_subvector(ctx, data, data_start + (bytes_written >> 2) + 1, size / 2, RegType::vgpr);
2886          bld.ds(op, address_offset, val0, val1, m, offset >> 2, (offset >> 2) + 1);
2887       } else {
2888          Temp val = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size, RegType::vgpr);
2889          bld.ds(op, address_offset, val, m, offset);
2890       }
2891
2892       bytes_written += size * 4;
2893    }
2894 }
2895
2896 void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
2897                Temp address, unsigned base_offset, unsigned align)
2898 {
2899    assert(util_is_power_of_two_nonzero(align) && align >= 4);
2900
2901    Operand m = load_lds_size_m0(ctx);
2902
2903    /* we need at most two stores for 32bit variables */
2904    int start[2], count[2];
2905    u_bit_scan_consecutive_range(&wrmask, &start[0], &count[0]);
2906    u_bit_scan_consecutive_range(&wrmask, &start[1], &count[1]);
2907    assert(wrmask == 0);
2908
2909    /* one combined store is sufficient */
2910    if (count[0] == count[1]) {
2911       Builder bld(ctx->program, ctx->block);
2912
2913       Temp address_offset = address;
2914       if ((base_offset >> 2) + start[1] > 255) {
2915          address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
2916          base_offset = 0;
2917       }
2918
2919       assert(count[0] == 1);
2920       Temp val0 = emit_extract_vector(ctx, data, start[0], v1);
2921       Temp val1 = emit_extract_vector(ctx, data, start[1], v1);
2922       aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
2923       base_offset = base_offset / elem_size_bytes;
2924       bld.ds(op, address_offset, val0, val1, m,
2925              base_offset + start[0], base_offset + start[1]);
2926       return;
2927    }
2928
2929    for (unsigned i = 0; i < 2; i++) {
2930       if (count[i] == 0)
2931          continue;
2932
2933       unsigned elem_size_words = elem_size_bytes / 4;
2934       ds_write_helper(ctx, m, address, data, start[i] * elem_size_words, count[i] * elem_size_words,
2935                       base_offset, start[i] * elem_size_bytes, align);
2936    }
2937    return;
2938 }
2939
2940 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
2941 {
2942    if (ctx->stage == vertex_vs) {
2943       visit_store_vs_output(ctx, instr);
2944    } else if (ctx->stage == fragment_fs) {
2945       visit_store_fs_output(ctx, instr);
2946    } else {
2947       unreachable("Shader stage not implemented");
2948    }
2949 }
2950
2951 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
2952 {
2953    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
2954    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
2955
2956    Builder bld(ctx->program, ctx->block);
2957    Temp tmp = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
2958    bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), tmp, idx, component);
2959 }
2960
2961 void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
2962 {
2963    aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
2964    for (unsigned i = 0; i < num_components; i++)
2965       vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
2966    if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
2967       assert(num_components == 4);
2968       Builder bld(ctx->program, ctx->block);
2969       vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
2970    }
2971
2972    for (Operand& op : vec->operands)
2973       op = op.isUndefined() ? Operand(0u) : op;
2974
2975    vec->definitions[0] = Definition(dst);
2976    ctx->block->instructions.emplace_back(std::move(vec));
2977    emit_split_vector(ctx, dst, num_components);
2978    return;
2979 }
2980
2981 void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
2982 {
2983    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2984    Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
2985    unsigned idx = nir_intrinsic_base(instr);
2986    unsigned component = nir_intrinsic_component(instr);
2987    Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
2988
2989    nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2990    if (offset) {
2991       assert(offset->u32 == 0);
2992    } else {
2993       /* the lower 15bit of the prim_mask contain the offset into LDS
2994        * while the upper bits contain the number of prims */
2995       Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
2996       assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
2997       Builder bld(ctx->program, ctx->block);
2998       Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
2999       stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
3000       stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
3001       offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
3002       prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
3003    }
3004
3005    if (instr->dest.ssa.num_components == 1) {
3006       emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
3007    } else {
3008       aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
3009       for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
3010       {
3011          Temp tmp = {ctx->program->allocateId(), v1};
3012          emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
3013          vec->operands[i] = Operand(tmp);
3014       }
3015       vec->definitions[0] = Definition(dst);
3016       ctx->block->instructions.emplace_back(std::move(vec));
3017    }
3018 }
3019
3020 unsigned get_num_channels_from_data_format(unsigned data_format)
3021 {
3022    switch (data_format) {
3023    case V_008F0C_BUF_DATA_FORMAT_8:
3024    case V_008F0C_BUF_DATA_FORMAT_16:
3025    case V_008F0C_BUF_DATA_FORMAT_32:
3026       return 1;
3027    case V_008F0C_BUF_DATA_FORMAT_8_8:
3028    case V_008F0C_BUF_DATA_FORMAT_16_16:
3029    case V_008F0C_BUF_DATA_FORMAT_32_32:
3030       return 2;
3031    case V_008F0C_BUF_DATA_FORMAT_10_11_11:
3032    case V_008F0C_BUF_DATA_FORMAT_11_11_10:
3033    case V_008F0C_BUF_DATA_FORMAT_32_32_32:
3034       return 3;
3035    case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
3036    case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
3037    case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
3038    case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
3039    case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
3040       return 4;
3041    default:
3042       break;
3043    }
3044
3045    return 4;
3046 }
3047
3048 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
3049  * so we may need to fix it up. */
3050 Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
3051 {
3052    Builder bld(ctx->program, ctx->block);
3053
3054    if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
3055       alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
3056
3057    /* For the integer-like cases, do a natural sign extension.
3058     *
3059     * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
3060     * and happen to contain 0, 1, 2, 3 as the two LSBs of the
3061     * exponent.
3062     */
3063    alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
3064    alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
3065
3066    /* Convert back to the right type. */
3067    if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
3068       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
3069       Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0xbf800000u), alpha);
3070       alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
3071    } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
3072       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
3073    }
3074
3075    return alpha;
3076 }
3077
3078 void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
3079 {
3080    Builder bld(ctx->program, ctx->block);
3081    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3082    if (ctx->stage & sw_vs) {
3083
3084       nir_instr *off_instr = instr->src[0].ssa->parent_instr;
3085       if (off_instr->type != nir_instr_type_load_const) {
3086          fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
3087          nir_print_instr(off_instr, stderr);
3088          fprintf(stderr, "\n");
3089       }
3090       uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
3091
3092       Temp vertex_buffers = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->vertex_buffers));
3093
3094       unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
3095       unsigned component = nir_intrinsic_component(instr);
3096       unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
3097       uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
3098       uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
3099       unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
3100
3101       unsigned dfmt = attrib_format & 0xf;
3102
3103       unsigned nfmt = (attrib_format >> 4) & 0x7;
3104       unsigned num_dfmt_channels = get_num_channels_from_data_format(dfmt);
3105       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
3106       unsigned num_channels = MIN2(util_last_bit(mask), num_dfmt_channels);
3107       unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
3108       bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
3109       if (post_shuffle)
3110          num_channels = MAX2(num_channels, 3);
3111
3112       Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, Operand(attrib_binding * 16u));
3113
3114       Temp index;
3115       if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
3116          uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
3117          Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
3118          if (divisor) {
3119             ctx->needs_instance_id = true;
3120             Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
3121             if (divisor != 1) {
3122                Temp divided = bld.tmp(v1);
3123                emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
3124                index = bld.vadd32(bld.def(v1), start_instance, divided);
3125             } else {
3126                index = bld.vadd32(bld.def(v1), start_instance, instance_id);
3127             }
3128          } else {
3129             index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), start_instance);
3130          }
3131       } else {
3132          index = bld.vadd32(bld.def(v1),
3133                             get_arg(ctx, ctx->args->ac.base_vertex),
3134                             get_arg(ctx, ctx->args->ac.vertex_id));
3135       }
3136
3137       if (attrib_stride != 0 && attrib_offset > attrib_stride) {
3138          index = bld.vadd32(bld.def(v1), Operand(attrib_offset / attrib_stride), index);
3139          attrib_offset = attrib_offset % attrib_stride;
3140       }
3141
3142       Operand soffset(0u);
3143       if (attrib_offset >= 4096) {
3144          soffset = bld.copy(bld.def(s1), Operand(attrib_offset));
3145          attrib_offset = 0;
3146       }
3147
3148       aco_opcode opcode;
3149       switch (num_channels) {
3150       case 1:
3151          opcode = aco_opcode::tbuffer_load_format_x;
3152          break;
3153       case 2:
3154          opcode = aco_opcode::tbuffer_load_format_xy;
3155          break;
3156       case 3:
3157          opcode = aco_opcode::tbuffer_load_format_xyz;
3158          break;
3159       case 4:
3160          opcode = aco_opcode::tbuffer_load_format_xyzw;
3161          break;
3162       default:
3163          unreachable("Unimplemented load_input vector size");
3164       }
3165
3166       Temp tmp = post_shuffle || num_channels != dst.size() || alpha_adjust != RADV_ALPHA_ADJUST_NONE || component ? bld.tmp(RegType::vgpr, num_channels) : dst;
3167
3168       aco_ptr<MTBUF_instruction> mubuf{create_instruction<MTBUF_instruction>(opcode, Format::MTBUF, 3, 1)};
3169       mubuf->operands[0] = Operand(index);
3170       mubuf->operands[1] = Operand(list);
3171       mubuf->operands[2] = soffset;
3172       mubuf->definitions[0] = Definition(tmp);
3173       mubuf->idxen = true;
3174       mubuf->can_reorder = true;
3175       mubuf->dfmt = dfmt;
3176       mubuf->nfmt = nfmt;
3177       assert(attrib_offset < 4096);
3178       mubuf->offset = attrib_offset;
3179       ctx->block->instructions.emplace_back(std::move(mubuf));
3180
3181       emit_split_vector(ctx, tmp, tmp.size());
3182
3183       if (tmp.id() != dst.id()) {
3184          bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
3185                          nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
3186
3187          static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
3188          static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
3189          const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
3190
3191          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3192          for (unsigned i = 0; i < dst.size(); i++) {
3193             unsigned idx = i + component;
3194             if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE && num_channels >= 4) {
3195                Temp alpha = emit_extract_vector(ctx, tmp, swizzle[3], v1);
3196                vec->operands[3] = Operand(adjust_vertex_fetch_alpha(ctx, alpha_adjust, alpha));
3197             } else if (idx < num_channels) {
3198                vec->operands[i] = Operand(emit_extract_vector(ctx, tmp, swizzle[idx], v1));
3199             } else if (is_float && idx == 3) {
3200                vec->operands[i] = Operand(0x3f800000u);
3201             } else if (!is_float && idx == 3) {
3202                vec->operands[i] = Operand(1u);
3203             } else {
3204                vec->operands[i] = Operand(0u);
3205             }
3206          }
3207          vec->definitions[0] = Definition(dst);
3208          ctx->block->instructions.emplace_back(std::move(vec));
3209          emit_split_vector(ctx, dst, dst.size());
3210       }
3211
3212    } else if (ctx->stage == fragment_fs) {
3213       nir_instr *off_instr = instr->src[0].ssa->parent_instr;
3214       if (off_instr->type != nir_instr_type_load_const ||
3215           nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
3216          fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
3217          nir_print_instr(off_instr, stderr);
3218          fprintf(stderr, "\n");
3219       }
3220
3221       Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
3222       nir_const_value* offset = nir_src_as_const_value(instr->src[0]);
3223       if (offset) {
3224          assert(offset->u32 == 0);
3225       } else {
3226          /* the lower 15bit of the prim_mask contain the offset into LDS
3227           * while the upper bits contain the number of prims */
3228          Temp offset_src = get_ssa_temp(ctx, instr->src[0].ssa);
3229          assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
3230          Builder bld(ctx->program, ctx->block);
3231          Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
3232          stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
3233          stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
3234          offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
3235          prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
3236       }
3237
3238       unsigned idx = nir_intrinsic_base(instr);
3239       unsigned component = nir_intrinsic_component(instr);
3240
3241       if (dst.size() == 1) {
3242          bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(2u), bld.m0(prim_mask), idx, component);
3243       } else {
3244          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3245          for (unsigned i = 0; i < dst.size(); i++)
3246             vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(2u), bld.m0(prim_mask), idx, component + i);
3247          vec->definitions[0] = Definition(dst);
3248          bld.insert(std::move(vec));
3249       }
3250
3251    } else {
3252       unreachable("Shader stage not implemented");
3253    }
3254 }
3255
3256 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
3257 {
3258    if (ctx->program->info->need_indirect_descriptor_sets) {
3259       Builder bld(ctx->program, ctx->block);
3260       Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
3261       return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, Operand(desc_set << 2));//, false, false, false);
3262    }
3263
3264    return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
3265 }
3266
3267
3268 void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
3269 {
3270    Builder bld(ctx->program, ctx->block);
3271    Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
3272    if (!ctx->divergent_vals[instr->dest.ssa.index])
3273       index = bld.as_uniform(index);
3274    unsigned desc_set = nir_intrinsic_desc_set(instr);
3275    unsigned binding = nir_intrinsic_binding(instr);
3276
3277    Temp desc_ptr;
3278    radv_pipeline_layout *pipeline_layout = ctx->options->layout;
3279    radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
3280    unsigned offset = layout->binding[binding].offset;
3281    unsigned stride;
3282    if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
3283        layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
3284       unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
3285       desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
3286       offset = pipeline_layout->push_constant_size + 16 * idx;
3287       stride = 16;
3288    } else {
3289       desc_ptr = load_desc_ptr(ctx, desc_set);
3290       stride = layout->binding[binding].size;
3291    }
3292
3293    nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
3294    unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
3295    if (stride != 1) {
3296       if (nir_const_index) {
3297          const_index = const_index * stride;
3298       } else if (index.type() == RegType::vgpr) {
3299          bool index24bit = layout->binding[binding].array_size <= 0x1000000;
3300          index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
3301       } else {
3302          index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
3303       }
3304    }
3305    if (offset) {
3306       if (nir_const_index) {
3307          const_index = const_index + offset;
3308       } else if (index.type() == RegType::vgpr) {
3309          index = bld.vadd32(bld.def(v1), Operand(offset), index);
3310       } else {
3311          index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
3312       }
3313    }
3314
3315    if (nir_const_index && const_index == 0) {
3316       index = desc_ptr;
3317    } else if (index.type() == RegType::vgpr) {
3318       index = bld.vadd32(bld.def(v1),
3319                          nir_const_index ? Operand(const_index) : Operand(index),
3320                          Operand(desc_ptr));
3321    } else {
3322       index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
3323                        nir_const_index ? Operand(const_index) : Operand(index),
3324                        Operand(desc_ptr));
3325    }
3326
3327    bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
3328 }
3329
3330 void load_buffer(isel_context *ctx, unsigned num_components, Temp dst,
3331                  Temp rsrc, Temp offset, bool glc=false, bool readonly=true)
3332 {
3333    Builder bld(ctx->program, ctx->block);
3334
3335    unsigned num_bytes = dst.size() * 4;
3336    bool dlc = glc && ctx->options->chip_class >= GFX10;
3337
3338    aco_opcode op;
3339    if (dst.type() == RegType::vgpr || (ctx->options->chip_class < GFX8 && !readonly)) {
3340       if (ctx->options->chip_class < GFX8)
3341          offset = as_vgpr(ctx, offset);
3342
3343       Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3344       Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
3345       unsigned const_offset = 0;
3346
3347       Temp lower = Temp();
3348       if (num_bytes > 16) {
3349          assert(num_components == 3 || num_components == 4);
3350          op = aco_opcode::buffer_load_dwordx4;
3351          lower = bld.tmp(v4);
3352          aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3353          mubuf->definitions[0] = Definition(lower);
3354          mubuf->operands[0] = vaddr;
3355          mubuf->operands[1] = Operand(rsrc);
3356          mubuf->operands[2] = soffset;
3357          mubuf->offen = (offset.type() == RegType::vgpr);
3358          mubuf->glc = glc;
3359          mubuf->dlc = dlc;
3360          mubuf->barrier = readonly ? barrier_none : barrier_buffer;
3361          mubuf->can_reorder = readonly;
3362          bld.insert(std::move(mubuf));
3363          emit_split_vector(ctx, lower, 2);
3364          num_bytes -= 16;
3365          const_offset = 16;
3366       }
3367
3368       switch (num_bytes) {
3369          case 4:
3370             op = aco_opcode::buffer_load_dword;
3371             break;
3372          case 8:
3373             op = aco_opcode::buffer_load_dwordx2;
3374             break;
3375          case 12:
3376             op = aco_opcode::buffer_load_dwordx3;
3377             break;
3378          case 16:
3379             op = aco_opcode::buffer_load_dwordx4;
3380             break;
3381          default:
3382             unreachable("Load SSBO not implemented for this size.");
3383       }
3384       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3385       mubuf->operands[0] = vaddr;
3386       mubuf->operands[1] = Operand(rsrc);
3387       mubuf->operands[2] = soffset;
3388       mubuf->offen = (offset.type() == RegType::vgpr);
3389       mubuf->glc = glc;
3390       mubuf->dlc = dlc;
3391       mubuf->barrier = readonly ? barrier_none : barrier_buffer;
3392       mubuf->can_reorder = readonly;
3393       mubuf->offset = const_offset;
3394       aco_ptr<Instruction> instr = std::move(mubuf);
3395
3396       if (dst.size() > 4) {
3397          assert(lower != Temp());
3398          Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size());
3399          instr->definitions[0] = Definition(upper);
3400          bld.insert(std::move(instr));
3401          if (dst.size() == 8)
3402             emit_split_vector(ctx, upper, 2);
3403          instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1));
3404          instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2));
3405          instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2));
3406          instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2));
3407          if (dst.size() == 8)
3408             instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2));
3409       }
3410
3411       if (dst.type() == RegType::sgpr) {
3412          Temp vec = bld.tmp(RegType::vgpr, dst.size());
3413          instr->definitions[0] = Definition(vec);
3414          bld.insert(std::move(instr));
3415          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
3416       } else {
3417          instr->definitions[0] = Definition(dst);
3418          bld.insert(std::move(instr));
3419       }
3420    } else {
3421       switch (num_bytes) {
3422          case 4:
3423             op = aco_opcode::s_buffer_load_dword;
3424             break;
3425          case 8:
3426             op = aco_opcode::s_buffer_load_dwordx2;
3427             break;
3428          case 12:
3429          case 16:
3430             op = aco_opcode::s_buffer_load_dwordx4;
3431             break;
3432          case 24:
3433          case 32:
3434             op = aco_opcode::s_buffer_load_dwordx8;
3435             break;
3436          default:
3437             unreachable("Load SSBO not implemented for this size.");
3438       }
3439       aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
3440       load->operands[0] = Operand(rsrc);
3441       load->operands[1] = Operand(bld.as_uniform(offset));
3442       assert(load->operands[1].getTemp().type() == RegType::sgpr);
3443       load->definitions[0] = Definition(dst);
3444       load->glc = glc;
3445       load->dlc = dlc;
3446       load->barrier = readonly ? barrier_none : barrier_buffer;
3447       load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
3448       assert(ctx->options->chip_class >= GFX8 || !glc);
3449
3450       /* trim vector */
3451       if (dst.size() == 3) {
3452          Temp vec = bld.tmp(s4);
3453          load->definitions[0] = Definition(vec);
3454          bld.insert(std::move(load));
3455          emit_split_vector(ctx, vec, 4);
3456
3457          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3458                     emit_extract_vector(ctx, vec, 0, s1),
3459                     emit_extract_vector(ctx, vec, 1, s1),
3460                     emit_extract_vector(ctx, vec, 2, s1));
3461       } else if (dst.size() == 6) {
3462          Temp vec = bld.tmp(s8);
3463          load->definitions[0] = Definition(vec);
3464          bld.insert(std::move(load));
3465          emit_split_vector(ctx, vec, 4);
3466
3467          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3468                     emit_extract_vector(ctx, vec, 0, s2),
3469                     emit_extract_vector(ctx, vec, 1, s2),
3470                     emit_extract_vector(ctx, vec, 2, s2));
3471       } else {
3472          bld.insert(std::move(load));
3473       }
3474
3475    }
3476    emit_split_vector(ctx, dst, num_components);
3477 }
3478
3479 void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
3480 {
3481    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3482    Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
3483
3484    Builder bld(ctx->program, ctx->block);
3485
3486    nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3487    unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
3488    unsigned binding = nir_intrinsic_binding(idx_instr);
3489    radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
3490
3491    if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
3492       uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3493                            S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3494                            S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3495                            S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3496       if (ctx->options->chip_class >= GFX10) {
3497          desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3498                       S_008F0C_OOB_SELECT(3) |
3499                       S_008F0C_RESOURCE_LEVEL(1);
3500       } else {
3501          desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3502                       S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3503       }
3504       Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
3505                                      Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
3506                                      Operand(0xFFFFFFFFu),
3507                                      Operand(desc_type));
3508       rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3509                         rsrc, upper_dwords);
3510    } else {
3511       rsrc = convert_pointer_to_64_bit(ctx, rsrc);
3512       rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
3513    }
3514
3515    load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa));
3516 }
3517
3518 void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3519 {
3520    Builder bld(ctx->program, ctx->block);
3521    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3522
3523    unsigned offset = nir_intrinsic_base(instr);
3524    nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
3525    if (index_cv && instr->dest.ssa.bit_size == 32) {
3526
3527       unsigned count = instr->dest.ssa.num_components;
3528       unsigned start = (offset + index_cv->u32) / 4u;
3529       start -= ctx->args->ac.base_inline_push_consts;
3530       if (start + count <= ctx->args->ac.num_inline_push_consts) {
3531          std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
3532          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
3533          for (unsigned i = 0; i < count; ++i) {
3534             elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
3535             vec->operands[i] = Operand{elems[i]};
3536          }
3537          vec->definitions[0] = Definition(dst);
3538          ctx->block->instructions.emplace_back(std::move(vec));
3539          ctx->allocated_vec.emplace(dst.id(), elems);
3540          return;
3541       }
3542    }
3543
3544    Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
3545    if (offset != 0) // TODO check if index != 0 as well
3546       index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
3547    Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
3548    Temp vec = dst;
3549    bool trim = false;
3550    aco_opcode op;
3551
3552    switch (dst.size()) {
3553    case 1:
3554       op = aco_opcode::s_load_dword;
3555       break;
3556    case 2:
3557       op = aco_opcode::s_load_dwordx2;
3558       break;
3559    case 3:
3560       vec = bld.tmp(s4);
3561       trim = true;
3562    case 4:
3563       op = aco_opcode::s_load_dwordx4;
3564       break;
3565    case 6:
3566       vec = bld.tmp(s8);
3567       trim = true;
3568    case 8:
3569       op = aco_opcode::s_load_dwordx8;
3570       break;
3571    default:
3572       unreachable("unimplemented or forbidden load_push_constant.");
3573    }
3574
3575    bld.smem(op, Definition(vec), ptr, index);
3576
3577    if (trim) {
3578       emit_split_vector(ctx, vec, 4);
3579       RegClass rc = dst.size() == 3 ? s1 : s2;
3580       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3581                  emit_extract_vector(ctx, vec, 0, rc),
3582                  emit_extract_vector(ctx, vec, 1, rc),
3583                  emit_extract_vector(ctx, vec, 2, rc));
3584
3585    }
3586    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
3587 }
3588
3589 void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3590 {
3591    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3592
3593    Builder bld(ctx->program, ctx->block);
3594
3595    uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3596                         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3597                         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3598                         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3599    if (ctx->options->chip_class >= GFX10) {
3600       desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3601                    S_008F0C_OOB_SELECT(3) |
3602                    S_008F0C_RESOURCE_LEVEL(1);
3603    } else {
3604       desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3605                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3606    }
3607
3608    unsigned base = nir_intrinsic_base(instr);
3609    unsigned range = nir_intrinsic_range(instr);
3610
3611    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
3612    if (base && offset.type() == RegType::sgpr)
3613       offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
3614    else if (base && offset.type() == RegType::vgpr)
3615       offset = bld.vadd32(bld.def(v1), Operand(base), offset);
3616
3617    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3618                           bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
3619                           Operand(MIN2(base + range, ctx->shader->constant_data_size)),
3620                           Operand(desc_type));
3621
3622    load_buffer(ctx, instr->num_components, dst, rsrc, offset);
3623 }
3624
3625 void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
3626 {
3627    if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3628       ctx->cf_info.exec_potentially_empty = true;
3629
3630    ctx->program->needs_exact = true;
3631
3632    // TODO: optimize uniform conditions
3633    Builder bld(ctx->program, ctx->block);
3634    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
3635    assert(src.regClass() == bld.lm);
3636    src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
3637    bld.pseudo(aco_opcode::p_discard_if, src);
3638    ctx->block->kind |= block_kind_uses_discard_if;
3639    return;
3640 }
3641
3642 void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
3643 {
3644    Builder bld(ctx->program, ctx->block);
3645
3646    if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3647       ctx->cf_info.exec_potentially_empty = true;
3648
3649    bool divergent = ctx->cf_info.parent_if.is_divergent ||
3650                     ctx->cf_info.parent_loop.has_divergent_continue;
3651
3652    if (ctx->block->loop_nest_depth &&
3653        ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
3654       /* we handle discards the same way as jump instructions */
3655       append_logical_end(ctx->block);
3656
3657       /* in loops, discard behaves like break */
3658       Block *linear_target = ctx->cf_info.parent_loop.exit;
3659       ctx->block->kind |= block_kind_discard;
3660
3661       if (!divergent) {
3662          /* uniform discard - loop ends here */
3663          assert(nir_instr_is_last(&instr->instr));
3664          ctx->block->kind |= block_kind_uniform;
3665          ctx->cf_info.has_branch = true;
3666          bld.branch(aco_opcode::p_branch);
3667          add_linear_edge(ctx->block->index, linear_target);
3668          return;
3669       }
3670
3671       /* we add a break right behind the discard() instructions */
3672       ctx->block->kind |= block_kind_break;
3673       unsigned idx = ctx->block->index;
3674
3675       /* remove critical edges from linear CFG */
3676       bld.branch(aco_opcode::p_branch);
3677       Block* break_block = ctx->program->create_and_insert_block();
3678       break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3679       break_block->kind |= block_kind_uniform;
3680       add_linear_edge(idx, break_block);
3681       add_linear_edge(break_block->index, linear_target);
3682       bld.reset(break_block);
3683       bld.branch(aco_opcode::p_branch);
3684
3685       Block* continue_block = ctx->program->create_and_insert_block();
3686       continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3687       add_linear_edge(idx, continue_block);
3688       append_logical_start(continue_block);
3689       ctx->block = continue_block;
3690
3691       return;
3692    }
3693
3694    /* it can currently happen that NIR doesn't remove the unreachable code */
3695    if (!nir_instr_is_last(&instr->instr)) {
3696       ctx->program->needs_exact = true;
3697       /* save exec somewhere temporarily so that it doesn't get
3698        * overwritten before the discard from outer exec masks */
3699       Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, bld.lm));
3700       bld.pseudo(aco_opcode::p_discard_if, cond);
3701       ctx->block->kind |= block_kind_uses_discard_if;
3702       return;
3703    }
3704
3705    /* This condition is incorrect for uniformly branched discards in a loop
3706     * predicated by a divergent condition, but the above code catches that case
3707     * and the discard would end up turning into a discard_if.
3708     * For example:
3709     * if (divergent) {
3710     *    while (...) {
3711     *       if (uniform) {
3712     *          discard;
3713     *       }
3714     *    }
3715     * }
3716     */
3717    if (!ctx->cf_info.parent_if.is_divergent) {
3718       /* program just ends here */
3719       ctx->block->kind |= block_kind_uniform;
3720       bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
3721               0 /* enabled mask */, 9 /* dest */,
3722               false /* compressed */, true/* done */, true /* valid mask */);
3723       bld.sopp(aco_opcode::s_endpgm);
3724       // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
3725    } else {
3726       ctx->block->kind |= block_kind_discard;
3727       /* branch and linear edge is added by visit_if() */
3728    }
3729 }
3730
/* Kind of descriptor requested from get_sampler_desc().
 * The PLANE_* values must stay consecutive: the plane number is computed as
 * (desc_type - ACO_DESC_PLANE_0). */
enum aco_descriptor_type {
   ACO_DESC_IMAGE = 0, /* 8-dword image descriptor */
   ACO_DESC_FMASK,     /* 8-dword descriptor stored 32 bytes after the image one */
   ACO_DESC_SAMPLER,   /* 4-dword sampler descriptor */
   ACO_DESC_BUFFER,    /* 4-dword buffer descriptor */
   ACO_DESC_PLANE_0,   /* 8-dword descriptor of plane 0 */
   ACO_DESC_PLANE_1,   /* 8-dword descriptor of plane 1 */
   ACO_DESC_PLANE_2,   /* plane 2: 4 dwords loaded, the rest taken from plane 1 */
};
3740
3741 static bool
3742 should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
3743    if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
3744       return false;
3745    ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
3746    return dim == ac_image_cube ||
3747           dim == ac_image_1darray ||
3748           dim == ac_image_2darray ||
3749           dim == ac_image_2darraymsaa;
3750 }
3751
3752 Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
3753                       enum aco_descriptor_type desc_type,
3754                       const nir_tex_instr *tex_instr, bool image, bool write)
3755 {
3756 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
3757    std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
3758    if (it != ctx->tex_desc.end())
3759       return it->second;
3760 */
3761    Temp index = Temp();
3762    bool index_set = false;
3763    unsigned constant_index = 0;
3764    unsigned descriptor_set;
3765    unsigned base_index;
3766    Builder bld(ctx->program, ctx->block);
3767
3768    if (!deref_instr) {
3769       assert(tex_instr && !image);
3770       descriptor_set = 0;
3771       base_index = tex_instr->sampler_index;
3772    } else {
3773       while(deref_instr->deref_type != nir_deref_type_var) {
3774          unsigned array_size = glsl_get_aoa_size(deref_instr->type);
3775          if (!array_size)
3776             array_size = 1;
3777
3778          assert(deref_instr->deref_type == nir_deref_type_array);
3779          nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
3780          if (const_value) {
3781             constant_index += array_size * const_value->u32;
3782          } else {
3783             Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
3784             if (indirect.type() == RegType::vgpr)
3785                indirect = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), indirect);
3786
3787             if (array_size != 1)
3788                indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
3789
3790             if (!index_set) {
3791                index = indirect;
3792                index_set = true;
3793             } else {
3794                index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
3795             }
3796          }
3797
3798          deref_instr = nir_src_as_deref(deref_instr->parent);
3799       }
3800       descriptor_set = deref_instr->var->data.descriptor_set;
3801       base_index = deref_instr->var->data.binding;
3802    }
3803
3804    Temp list = load_desc_ptr(ctx, descriptor_set);
3805    list = convert_pointer_to_64_bit(ctx, list);
3806
3807    struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
3808    struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
3809    unsigned offset = binding->offset;
3810    unsigned stride = binding->size;
3811    aco_opcode opcode;
3812    RegClass type;
3813
3814    assert(base_index < layout->binding_count);
3815
3816    switch (desc_type) {
3817    case ACO_DESC_IMAGE:
3818       type = s8;
3819       opcode = aco_opcode::s_load_dwordx8;
3820       break;
3821    case ACO_DESC_FMASK:
3822       type = s8;
3823       opcode = aco_opcode::s_load_dwordx8;
3824       offset += 32;
3825       break;
3826    case ACO_DESC_SAMPLER:
3827       type = s4;
3828       opcode = aco_opcode::s_load_dwordx4;
3829       if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
3830          offset += radv_combined_image_descriptor_sampler_offset(binding);
3831       break;
3832    case ACO_DESC_BUFFER:
3833       type = s4;
3834       opcode = aco_opcode::s_load_dwordx4;
3835       break;
3836    case ACO_DESC_PLANE_0:
3837    case ACO_DESC_PLANE_1:
3838       type = s8;
3839       opcode = aco_opcode::s_load_dwordx8;
3840       offset += 32 * (desc_type - ACO_DESC_PLANE_0);
3841       break;
3842    case ACO_DESC_PLANE_2:
3843       type = s4;
3844       opcode = aco_opcode::s_load_dwordx4;
3845       offset += 64;
3846       break;
3847    default:
3848       unreachable("invalid desc_type\n");
3849    }
3850
3851    offset += constant_index * stride;
3852
3853    if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
3854       (!index_set || binding->immutable_samplers_equal)) {
3855       if (binding->immutable_samplers_equal)
3856          constant_index = 0;
3857
3858       const uint32_t *samplers = radv_immutable_samplers(layout, binding);
3859       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3860                         Operand(samplers[constant_index * 4 + 0]),
3861                         Operand(samplers[constant_index * 4 + 1]),
3862                         Operand(samplers[constant_index * 4 + 2]),
3863                         Operand(samplers[constant_index * 4 + 3]));
3864    }
3865
3866    Operand off;
3867    if (!index_set) {
3868       off = Operand(offset);
3869    } else {
3870       off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
3871                                    bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
3872    }
3873
3874    Temp res = bld.smem(opcode, bld.def(type), list, off);
3875
3876    if (desc_type == ACO_DESC_PLANE_2) {
3877       Temp components[8];
3878       for (unsigned i = 0; i < 8; i++)
3879          components[i] = bld.tmp(s1);
3880       bld.pseudo(aco_opcode::p_split_vector,
3881                  Definition(components[0]),
3882                  Definition(components[1]),
3883                  Definition(components[2]),
3884                  Definition(components[3]),
3885                  res);
3886
3887       Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
3888       bld.pseudo(aco_opcode::p_split_vector,
3889                  bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
3890                  Definition(components[4]),
3891                  Definition(components[5]),
3892                  Definition(components[6]),
3893                  Definition(components[7]),
3894                  desc2);
3895
3896       res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
3897                        components[0], components[1], components[2], components[3],
3898                        components[4], components[5], components[6], components[7]);
3899    }
3900
3901    return res;
3902 }
3903
3904 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
3905 {
3906    switch (dim) {
3907    case GLSL_SAMPLER_DIM_BUF:
3908       return 1;
3909    case GLSL_SAMPLER_DIM_1D:
3910       return array ? 2 : 1;
3911    case GLSL_SAMPLER_DIM_2D:
3912       return array ? 3 : 2;
3913    case GLSL_SAMPLER_DIM_MS:
3914       return array ? 4 : 3;
3915    case GLSL_SAMPLER_DIM_3D:
3916    case GLSL_SAMPLER_DIM_CUBE:
3917       return 3;
3918    case GLSL_SAMPLER_DIM_RECT:
3919    case GLSL_SAMPLER_DIM_SUBPASS:
3920       return 2;
3921    case GLSL_SAMPLER_DIM_SUBPASS_MS:
3922       return 3;
3923    default:
3924       break;
3925    }
3926    return 0;
3927 }
3928
3929
3930 /* Adjust the sample index according to FMASK.
3931  *
3932  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3933  * which is the identity mapping. Each nibble says which physical sample
3934  * should be fetched to get that sample.
3935  *
3936  * For example, 0x11111100 means there are only 2 samples stored and
3937  * the second sample covers 3/4 of the pixel. When reading samples 0
3938  * and 1, return physical sample 0 (determined by the first two 0s
3939  * in FMASK), otherwise return physical sample 1.
3940  *
3941  * The sample index should be adjusted as follows:
3942  *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
3943  */
3944 static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coords, Operand sample_index, Temp fmask_desc_ptr)
3945 {
3946    Builder bld(ctx->program, ctx->block);
3947    Temp fmask = bld.tmp(v1);
3948    unsigned dim = ctx->options->chip_class >= GFX10
3949                   ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
3950                   : 0;
3951
3952    aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
3953    load->operands[0] = Operand(coords);
3954    load->operands[1] = Operand(fmask_desc_ptr);
3955    load->definitions[0] = Definition(fmask);
3956    load->glc = false;
3957    load->dlc = false;
3958    load->dmask = 0x1;
3959    load->unrm = true;
3960    load->da = da;
3961    load->dim = dim;
3962    load->can_reorder = true; /* fmask images shouldn't be modified */
3963    ctx->block->instructions.emplace_back(std::move(load));
3964
3965    Operand sample_index4;
3966    if (sample_index.isConstant() && sample_index.constantValue() < 16) {
3967       sample_index4 = Operand(sample_index.constantValue() << 2);
3968    } else if (sample_index.regClass() == s1) {
3969       sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
3970    } else {
3971       assert(sample_index.regClass() == v1);
3972       sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
3973    }
3974
3975    Temp final_sample;
3976    if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
3977       final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
3978    else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
3979       final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
3980    else
3981       final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
3982
3983    /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3984     * resource descriptor is 0 (invalid),
3985     */
3986    Temp compare = bld.tmp(bld.lm);
3987    bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
3988                 Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
3989
3990    Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
3991
3992    /* Replace the MSAA sample index. */
3993    return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
3994 }
3995
3996 static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
3997 {
3998
3999    Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
4000    enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4001    bool is_array = glsl_sampler_type_is_array(type);
4002    ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
4003    assert(!add_frag_pos && "Input attachments should be lowered.");
4004    bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
4005    bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
4006    int count = image_type_to_components_count(dim, is_array);
4007    std::vector<Operand> coords(count);
4008
4009    if (is_ms) {
4010       Operand sample_index;
4011       nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
4012       if (sample_cv)
4013          sample_index = Operand(sample_cv->u32);
4014       else
4015          sample_index = Operand(emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1));
4016
4017       if (instr->intrinsic == nir_intrinsic_image_deref_load) {
4018          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, is_array ? 3 : 2, 1)};
4019          for (unsigned i = 0; i < vec->operands.size(); i++)
4020             vec->operands[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
4021          Temp fmask_load_address = {ctx->program->allocateId(), is_array ? v3 : v2};
4022          vec->definitions[0] = Definition(fmask_load_address);
4023          ctx->block->instructions.emplace_back(std::move(vec));
4024
4025          Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
4026          sample_index = Operand(adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr));
4027       }
4028       count--;
4029       coords[count] = sample_index;
4030    }
4031
4032    if (count == 1 && !gfx9_1d)
4033       return emit_extract_vector(ctx, src0, 0, v1);
4034
4035    if (gfx9_1d) {
4036       coords[0] = Operand(emit_extract_vector(ctx, src0, 0, v1));
4037       coords.resize(coords.size() + 1);
4038       coords[1] = Operand((uint32_t) 0);
4039       if (is_array)
4040          coords[2] = Operand(emit_extract_vector(ctx, src0, 1, v1));
4041    } else {
4042       for (int i = 0; i < count; i++)
4043          coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
4044    }
4045
4046    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
4047    for (unsigned i = 0; i < coords.size(); i++)
4048       vec->operands[i] = coords[i];
4049    Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
4050    vec->definitions[0] = Definition(res);
4051    ctx->block->instructions.emplace_back(std::move(vec));
4052    return res;
4053 }
4054
4055
4056 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
4057 {
4058    Builder bld(ctx->program, ctx->block);
4059    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4060    const struct glsl_type *type = glsl_without_array(var->type);
4061    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4062    bool is_array = glsl_sampler_type_is_array(type);
4063    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4064
4065    if (dim == GLSL_SAMPLER_DIM_BUF) {
4066       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
4067       unsigned num_channels = util_last_bit(mask);
4068       Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
4069       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
4070
4071       aco_opcode opcode;
4072       switch (num_channels) {
4073       case 1:
4074          opcode = aco_opcode::buffer_load_format_x;
4075          break;
4076       case 2:
4077          opcode = aco_opcode::buffer_load_format_xy;
4078          break;
4079       case 3:
4080          opcode = aco_opcode::buffer_load_format_xyz;
4081          break;
4082       case 4:
4083          opcode = aco_opcode::buffer_load_format_xyzw;
4084          break;
4085       default:
4086          unreachable(">4 channel buffer image load");
4087       }
4088       aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
4089       load->operands[0] = Operand(vindex);
4090       load->operands[1] = Operand(rsrc);
4091       load->operands[2] = Operand((uint32_t) 0);
4092       Temp tmp;
4093       if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
4094          tmp = dst;
4095       else
4096          tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
4097       load->definitions[0] = Definition(tmp);
4098       load->idxen = true;
4099       load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
4100       load->dlc = load->glc && ctx->options->chip_class >= GFX10;
4101       load->barrier = barrier_image;
4102       ctx->block->instructions.emplace_back(std::move(load));
4103
4104       expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
4105       return;
4106    }
4107
4108    Temp coords = get_image_coords(ctx, instr, type);
4109    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4110
4111    unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
4112    unsigned num_components = util_bitcount(dmask);
4113    Temp tmp;
4114    if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
4115       tmp = dst;
4116    else
4117       tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
4118
4119    aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
4120    load->operands[0] = Operand(coords);
4121    load->operands[1] = Operand(resource);
4122    load->definitions[0] = Definition(tmp);
4123    load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
4124    load->dlc = load->glc && ctx->options->chip_class >= GFX10;
4125    load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4126    load->dmask = dmask;
4127    load->unrm = true;
4128    load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4129    load->barrier = barrier_image;
4130    ctx->block->instructions.emplace_back(std::move(load));
4131
4132    expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
4133    return;
4134 }
4135
4136 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
4137 {
4138    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4139    const struct glsl_type *type = glsl_without_array(var->type);
4140    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4141    bool is_array = glsl_sampler_type_is_array(type);
4142    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
4143
4144    bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
4145
4146    if (dim == GLSL_SAMPLER_DIM_BUF) {
4147       Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
4148       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
4149       aco_opcode opcode;
4150       switch (data.size()) {
4151       case 1:
4152          opcode = aco_opcode::buffer_store_format_x;
4153          break;
4154       case 2:
4155          opcode = aco_opcode::buffer_store_format_xy;
4156          break;
4157       case 3:
4158          opcode = aco_opcode::buffer_store_format_xyz;
4159          break;
4160       case 4:
4161          opcode = aco_opcode::buffer_store_format_xyzw;
4162          break;
4163       default:
4164          unreachable(">4 channel buffer image store");
4165       }
4166       aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
4167       store->operands[0] = Operand(vindex);
4168       store->operands[1] = Operand(rsrc);
4169       store->operands[2] = Operand((uint32_t) 0);
4170       store->operands[3] = Operand(data);
4171       store->idxen = true;
4172       store->glc = glc;
4173       store->dlc = false;
4174       store->disable_wqm = true;
4175       store->barrier = barrier_image;
4176       ctx->program->needs_exact = true;
4177       ctx->block->instructions.emplace_back(std::move(store));
4178       return;
4179    }
4180
4181    assert(data.type() == RegType::vgpr);
4182    Temp coords = get_image_coords(ctx, instr, type);
4183    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4184
4185    aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(aco_opcode::image_store, Format::MIMG, 4, 0)};
4186    store->operands[0] = Operand(coords);
4187    store->operands[1] = Operand(resource);
4188    store->operands[2] = Operand(s4);
4189    store->operands[3] = Operand(data);
4190    store->glc = glc;
4191    store->dlc = false;
4192    store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4193    store->dmask = (1 << data.size()) - 1;
4194    store->unrm = true;
4195    store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4196    store->disable_wqm = true;
4197    store->barrier = barrier_image;
4198    ctx->program->needs_exact = true;
4199    ctx->block->instructions.emplace_back(std::move(store));
4200    return;
4201 }
4202
4203 void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4204 {
4205    /* return the previous value if dest is ever used */
4206    bool return_previous = false;
4207    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4208       return_previous = true;
4209       break;
4210    }
4211    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4212       return_previous = true;
4213       break;
4214    }
4215
4216    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4217    const struct glsl_type *type = glsl_without_array(var->type);
4218    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4219    bool is_array = glsl_sampler_type_is_array(type);
4220    Builder bld(ctx->program, ctx->block);
4221
4222    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
4223    assert(data.size() == 1 && "64bit ssbo atomics not yet implemented.");
4224
4225    if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
4226       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
4227
4228    aco_opcode buf_op, image_op;
4229    switch (instr->intrinsic) {
4230       case nir_intrinsic_image_deref_atomic_add:
4231          buf_op = aco_opcode::buffer_atomic_add;
4232          image_op = aco_opcode::image_atomic_add;
4233          break;
4234       case nir_intrinsic_image_deref_atomic_umin:
4235          buf_op = aco_opcode::buffer_atomic_umin;
4236          image_op = aco_opcode::image_atomic_umin;
4237          break;
4238       case nir_intrinsic_image_deref_atomic_imin:
4239          buf_op = aco_opcode::buffer_atomic_smin;
4240          image_op = aco_opcode::image_atomic_smin;
4241          break;
4242       case nir_intrinsic_image_deref_atomic_umax:
4243          buf_op = aco_opcode::buffer_atomic_umax;
4244          image_op = aco_opcode::image_atomic_umax;
4245          break;
4246       case nir_intrinsic_image_deref_atomic_imax:
4247          buf_op = aco_opcode::buffer_atomic_smax;
4248          image_op = aco_opcode::image_atomic_smax;
4249          break;
4250       case nir_intrinsic_image_deref_atomic_and:
4251          buf_op = aco_opcode::buffer_atomic_and;
4252          image_op = aco_opcode::image_atomic_and;
4253          break;
4254       case nir_intrinsic_image_deref_atomic_or:
4255          buf_op = aco_opcode::buffer_atomic_or;
4256          image_op = aco_opcode::image_atomic_or;
4257          break;
4258       case nir_intrinsic_image_deref_atomic_xor:
4259          buf_op = aco_opcode::buffer_atomic_xor;
4260          image_op = aco_opcode::image_atomic_xor;
4261          break;
4262       case nir_intrinsic_image_deref_atomic_exchange:
4263          buf_op = aco_opcode::buffer_atomic_swap;
4264          image_op = aco_opcode::image_atomic_swap;
4265          break;
4266       case nir_intrinsic_image_deref_atomic_comp_swap:
4267          buf_op = aco_opcode::buffer_atomic_cmpswap;
4268          image_op = aco_opcode::image_atomic_cmpswap;
4269          break;
4270       default:
4271          unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
4272    }
4273
4274    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4275
4276    if (dim == GLSL_SAMPLER_DIM_BUF) {
4277       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
4278       Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
4279       //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
4280       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
4281       mubuf->operands[0] = Operand(vindex);
4282       mubuf->operands[1] = Operand(resource);
4283       mubuf->operands[2] = Operand((uint32_t)0);
4284       mubuf->operands[3] = Operand(data);
4285       if (return_previous)
4286          mubuf->definitions[0] = Definition(dst);
4287       mubuf->offset = 0;
4288       mubuf->idxen = true;
4289       mubuf->glc = return_previous;
4290       mubuf->dlc = false; /* Not needed for atomics */
4291       mubuf->disable_wqm = true;
4292       mubuf->barrier = barrier_image;
4293       ctx->program->needs_exact = true;
4294       ctx->block->instructions.emplace_back(std::move(mubuf));
4295       return;
4296    }
4297
4298    Temp coords = get_image_coords(ctx, instr, type);
4299    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4300    aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 4, return_previous ? 1 : 0)};
4301    mimg->operands[0] = Operand(coords);
4302    mimg->operands[1] = Operand(resource);
4303    mimg->operands[2] = Operand(s4); /* no sampler */
4304    mimg->operands[3] = Operand(data);
4305    if (return_previous)
4306       mimg->definitions[0] = Definition(dst);
4307    mimg->glc = return_previous;
4308    mimg->dlc = false; /* Not needed for atomics */
4309    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4310    mimg->dmask = (1 << data.size()) - 1;
4311    mimg->unrm = true;
4312    mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4313    mimg->disable_wqm = true;
4314    mimg->barrier = barrier_image;
4315    ctx->program->needs_exact = true;
4316    ctx->block->instructions.emplace_back(std::move(mimg));
4317    return;
4318 }
4319
4320 void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
4321 {
4322    if (in_elements && ctx->options->chip_class == GFX8) {
4323       Builder bld(ctx->program, ctx->block);
4324
4325       Temp stride = emit_extract_vector(ctx, desc, 1, s1);
4326       stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
4327       stride = bld.vop1(aco_opcode::v_cvt_f32_ubyte0, bld.def(v1), stride);
4328       stride = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), stride);
4329
4330       Temp size = emit_extract_vector(ctx, desc, 2, s1);
4331       size = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), size);
4332
4333       Temp res = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), size, stride);
4334       res = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), res);
4335       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
4336
4337       // TODO: we can probably calculate this faster on the scalar unit to do: size / stride{1,2,4,8,12,16}
4338       /* idea
4339        * for 1,2,4,8,16, the result is just (stride >> S_FF1_I32_B32)
4340        * in case 12 (or 3?), we have to divide by 3:
4341        * set v_skip in case it's 12 (if we also have to take care of 3, shift first)
4342        * use v_mul_hi_u32 with magic number to divide
4343        * we need some pseudo merge opcode to overwrite the original SALU result with readfirstlane
4344        * disable v_skip
4345        * total: 6 SALU + 2 VALU instructions vs 1 SALU + 6 VALU instructions
4346        */
4347
4348    } else {
4349       emit_extract_vector(ctx, desc, 2, dst);
4350    }
4351 }
4352
4353 void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
4354 {
4355    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4356    const struct glsl_type *type = glsl_without_array(var->type);
4357    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4358    bool is_array = glsl_sampler_type_is_array(type);
4359    Builder bld(ctx->program, ctx->block);
4360
4361    if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
4362       Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
4363       return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
4364    }
4365
4366    /* LOD */
4367    Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
4368
4369    /* Resource */
4370    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
4371
4372    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4373
4374    aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)};
4375    mimg->operands[0] = Operand(lod);
4376    mimg->operands[1] = Operand(resource);
4377    unsigned& dmask = mimg->dmask;
4378    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4379    mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
4380    mimg->da = glsl_sampler_type_is_array(type);
4381    mimg->can_reorder = true;
4382    Definition& def = mimg->definitions[0];
4383    ctx->block->instructions.emplace_back(std::move(mimg));
4384
4385    if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
4386        glsl_sampler_type_is_array(type)) {
4387
4388       assert(instr->dest.ssa.num_components == 3);
4389       Temp tmp = {ctx->program->allocateId(), v3};
4390       def = Definition(tmp);
4391       emit_split_vector(ctx, tmp, 3);
4392
4393       /* divide 3rd value by 6 by multiplying with magic number */
4394       Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
4395       Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
4396
4397       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4398                  emit_extract_vector(ctx, tmp, 0, v1),
4399                  emit_extract_vector(ctx, tmp, 1, v1),
4400                  by_6);
4401
4402    } else if (ctx->options->chip_class == GFX9 &&
4403               glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
4404               glsl_sampler_type_is_array(type)) {
4405       assert(instr->dest.ssa.num_components == 2);
4406       def = Definition(dst);
4407       dmask = 0x5;
4408    } else {
4409       def = Definition(dst);
4410    }
4411
4412    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
4413 }
4414
4415 void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4416 {
4417    Builder bld(ctx->program, ctx->block);
4418    unsigned num_components = instr->num_components;
4419
4420    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4421    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4422    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4423
4424    bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4425    load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc, false);
4426 }
4427
/**
 * Emit the store(s) for an SSBO store intrinsic.
 *
 * Sources: src[0] is the data, src[1] the buffer (converted to a 64-bit
 * pointer from which the descriptor is loaded) and src[2] the offset.
 *
 * The write mask is processed one run of consecutive components at a time.
 * Each run becomes either a scalar (SMEM) store, when the offset is not
 * divergent and the chip is GFX8 or newer, or a MUBUF store otherwise.
 */
void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
{
   Builder bld(ctx->program, ctx->block);
   Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
   unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
   unsigned writemask = nir_intrinsic_write_mask(instr);

   /* before GFX8 the offset is always moved to a VGPR */
   Temp offset;
   if (ctx->options->chip_class < GFX8)
      offset = as_vgpr(ctx,get_ssa_temp(ctx, instr->src[2].ssa));
   else
      offset = get_ssa_temp(ctx, instr->src[2].ssa);

   /* load the four-dword buffer descriptor */
   Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
   rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));

   /* scalar stores are used when the offset is not divergent (GFX8+ only) */
   bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
               ctx->options->chip_class >= GFX8;
   if (smem)
      offset = bld.as_uniform(offset);
   /* outside of fragment shaders the SMEM path also makes the data uniform;
    * fragment shaders use the p_fs_buffer_store_smem pseudo opcode instead */
   bool smem_nonfs = smem && ctx->stage != fragment_fs;

   while (writemask) {
      /* take the next run of consecutive enabled components out of the mask */
      int start, count;
      u_bit_scan_consecutive_range(&writemask, &start, &count);
      if (count == 3 && smem) {
         /* the 12-byte case below is VMEM-only: store two components now and
          * put the third one back into the mask */
         writemask |= 1u << (start + 2);
         count = 2;
      }
      int num_bytes = count * elem_size_bytes;

      if (num_bytes > 16) {
         /* more than 16 bytes is only expected with 8-byte elements: store
          * the first two and put the remaining components back into the mask */
         assert(elem_size_bytes == 8);
         writemask |= (((count - 2) << 1) - 1) << (start + 2);
         count = 2;
         num_bytes = 16;
      }

      // TODO: check alignment of sub-dword stores
      // TODO: split 3 bytes. there is no store instruction for that

      /* build the value for this run in the register type the chosen store
       * path uses */
      Temp write_data;
      if (count != instr->num_components) {
         /* partial store: gather the selected components into a new vector */
         emit_split_vector(ctx, data, instr->num_components);
         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
         for (int i = 0; i < count; i++) {
            Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
            vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
         }
         write_data = bld.tmp(!smem ? RegType::vgpr : smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
         vec->definitions[0] = Definition(write_data);
         ctx->block->instructions.emplace_back(std::move(vec));
      } else if (!smem && data.type() != RegType::vgpr) {
         /* full store through MUBUF: the data is copied to VGPRs */
         assert(num_bytes % 4 == 0);
         write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
      } else if (smem_nonfs && data.type() == RegType::vgpr) {
         /* full store through SMEM: the data is made uniform */
         assert(num_bytes % 4 == 0);
         write_data = bld.as_uniform(data);
      } else {
         write_data = data;
      }

      /* pick the vector-memory and scalar-memory opcode for this size */
      aco_opcode vmem_op, smem_op;
      switch (num_bytes) {
         case 4:
            vmem_op = aco_opcode::buffer_store_dword;
            smem_op = aco_opcode::s_buffer_store_dword;
            break;
         case 8:
            vmem_op = aco_opcode::buffer_store_dwordx2;
            smem_op = aco_opcode::s_buffer_store_dwordx2;
            break;
         case 12:
            /* no scalar opcode is assigned for three dwords */
            vmem_op = aco_opcode::buffer_store_dwordx3;
            smem_op = aco_opcode::last_opcode;
            assert(!smem);
            break;
         case 16:
            vmem_op = aco_opcode::buffer_store_dwordx4;
            smem_op = aco_opcode::s_buffer_store_dwordx4;
            break;
         default:
            unreachable("Store SSBO not implemented for this size.");
      }
      /* fragment shaders always use the pseudo opcode on the scalar path */
      if (ctx->stage == fragment_fs)
         smem_op = aco_opcode::p_fs_buffer_store_smem;

      if (smem) {
         aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
         store->operands[0] = Operand(rsrc);
         if (start) {
            /* add the byte offset of the first stored component */
            Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
                                offset, Operand(start * elem_size_bytes));
            store->operands[1] = Operand(off);
         } else {
            store->operands[1] = Operand(offset);
         }
         /* for the real s_buffer_store_* opcodes the offset operand is fixed to m0 */
         if (smem_op != aco_opcode::p_fs_buffer_store_smem)
            store->operands[1].setFixed(m0);
         store->operands[2] = Operand(write_data);
         store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
         store->dlc = false;
         store->disable_wqm = true;
         store->barrier = barrier_buffer;
         ctx->block->instructions.emplace_back(std::move(store));
         ctx->program->wb_smem_l1_on_end = true;
         if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
            /* the pseudo opcode marks the block as needing lowering and the
             * program as needing exact */
            ctx->block->kind |= block_kind_needs_lowering;
            ctx->program->needs_exact = true;
         }
      } else {
         aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
         /* a VGPR offset goes into operand 0 (with offen), an SGPR offset into
          * operand 2; the constant part is the instruction's offset field */
         store->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
         store->operands[1] = Operand(rsrc);
         store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
         store->operands[3] = Operand(write_data);
         store->offset = start * elem_size_bytes;
         store->offen = (offset.type() == RegType::vgpr);
         store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
         store->dlc = false;
         store->disable_wqm = true;
         store->barrier = barrier_buffer;
         ctx->program->needs_exact = true;
         ctx->block->instructions.emplace_back(std::move(store));
      }
   }
}
4555
4556 void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4557 {
4558    /* return the previous value if dest is ever used */
4559    bool return_previous = false;
4560    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4561       return_previous = true;
4562       break;
4563    }
4564    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4565       return_previous = true;
4566       break;
4567    }
4568
4569    Builder bld(ctx->program, ctx->block);
4570    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
4571
4572    if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
4573       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
4574                         get_ssa_temp(ctx, instr->src[3].ssa), data);
4575
4576    Temp offset;
4577    if (ctx->options->chip_class < GFX8)
4578       offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4579    else
4580       offset = get_ssa_temp(ctx, instr->src[1].ssa);
4581
4582    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4583    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4584
4585    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4586
4587    aco_opcode op32, op64;
4588    switch (instr->intrinsic) {
4589       case nir_intrinsic_ssbo_atomic_add:
4590          op32 = aco_opcode::buffer_atomic_add;
4591          op64 = aco_opcode::buffer_atomic_add_x2;
4592          break;
4593       case nir_intrinsic_ssbo_atomic_imin:
4594          op32 = aco_opcode::buffer_atomic_smin;
4595          op64 = aco_opcode::buffer_atomic_smin_x2;
4596          break;
4597       case nir_intrinsic_ssbo_atomic_umin:
4598          op32 = aco_opcode::buffer_atomic_umin;
4599          op64 = aco_opcode::buffer_atomic_umin_x2;
4600          break;
4601       case nir_intrinsic_ssbo_atomic_imax:
4602          op32 = aco_opcode::buffer_atomic_smax;
4603          op64 = aco_opcode::buffer_atomic_smax_x2;
4604          break;
4605       case nir_intrinsic_ssbo_atomic_umax:
4606          op32 = aco_opcode::buffer_atomic_umax;
4607          op64 = aco_opcode::buffer_atomic_umax_x2;
4608          break;
4609       case nir_intrinsic_ssbo_atomic_and:
4610          op32 = aco_opcode::buffer_atomic_and;
4611          op64 = aco_opcode::buffer_atomic_and_x2;
4612          break;
4613       case nir_intrinsic_ssbo_atomic_or:
4614          op32 = aco_opcode::buffer_atomic_or;
4615          op64 = aco_opcode::buffer_atomic_or_x2;
4616          break;
4617       case nir_intrinsic_ssbo_atomic_xor:
4618          op32 = aco_opcode::buffer_atomic_xor;
4619          op64 = aco_opcode::buffer_atomic_xor_x2;
4620          break;
4621       case nir_intrinsic_ssbo_atomic_exchange:
4622          op32 = aco_opcode::buffer_atomic_swap;
4623          op64 = aco_opcode::buffer_atomic_swap_x2;
4624          break;
4625       case nir_intrinsic_ssbo_atomic_comp_swap:
4626          op32 = aco_opcode::buffer_atomic_cmpswap;
4627          op64 = aco_opcode::buffer_atomic_cmpswap_x2;
4628          break;
4629       default:
4630          unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
4631    }
4632    aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
4633    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
4634    mubuf->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4635    mubuf->operands[1] = Operand(rsrc);
4636    mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4637    mubuf->operands[3] = Operand(data);
4638    if (return_previous)
4639       mubuf->definitions[0] = Definition(dst);
4640    mubuf->offset = 0;
4641    mubuf->offen = (offset.type() == RegType::vgpr);
4642    mubuf->glc = return_previous;
4643    mubuf->dlc = false; /* Not needed for atomics */
4644    mubuf->disable_wqm = true;
4645    mubuf->barrier = barrier_buffer;
4646    ctx->program->needs_exact = true;
4647    ctx->block->instructions.emplace_back(std::move(mubuf));
4648 }
4649
4650 void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
4651
4652    Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4653    Builder bld(ctx->program, ctx->block);
4654    Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
4655    get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
4656 }
4657
4658 void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
4659 {
4660    Builder bld(ctx->program, ctx->block);
4661    unsigned num_components = instr->num_components;
4662    unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8;
4663
4664    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4665    Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
4666
4667    bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4668    bool dlc = glc && ctx->options->chip_class >= GFX10;
4669    aco_opcode op;
4670    if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
4671       bool global = ctx->options->chip_class >= GFX9;
4672       aco_opcode op;
4673       switch (num_bytes) {
4674       case 4:
4675          op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
4676          break;
4677       case 8:
4678          op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
4679          break;
4680       case 12:
4681          op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4682          break;
4683       case 16:
4684          op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
4685          break;
4686       default:
4687          unreachable("load_global not implemented for this size.");
4688       }
4689       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4690       flat->operands[0] = Operand(addr);
4691       flat->operands[1] = Operand(s1);
4692       flat->glc = glc;
4693       flat->dlc = dlc;
4694       flat->barrier = barrier_buffer;
4695
4696       if (dst.type() == RegType::sgpr) {
4697          Temp vec = bld.tmp(RegType::vgpr, dst.size());
4698          flat->definitions[0] = Definition(vec);
4699          ctx->block->instructions.emplace_back(std::move(flat));
4700          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
4701       } else {
4702          flat->definitions[0] = Definition(dst);
4703          ctx->block->instructions.emplace_back(std::move(flat));
4704       }
4705       emit_split_vector(ctx, dst, num_components);
4706    } else {
4707       switch (num_bytes) {
4708          case 4:
4709             op = aco_opcode::s_load_dword;
4710             break;
4711          case 8:
4712             op = aco_opcode::s_load_dwordx2;
4713             break;
4714          case 12:
4715          case 16:
4716             op = aco_opcode::s_load_dwordx4;
4717             break;
4718          default:
4719             unreachable("load_global not implemented for this size.");
4720       }
4721       aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4722       load->operands[0] = Operand(addr);
4723       load->operands[1] = Operand(0u);
4724       load->definitions[0] = Definition(dst);
4725       load->glc = glc;
4726       load->dlc = dlc;
4727       load->barrier = barrier_buffer;
4728       assert(ctx->options->chip_class >= GFX8 || !glc);
4729
4730       if (dst.size() == 3) {
4731          /* trim vector */
4732          Temp vec = bld.tmp(s4);
4733          load->definitions[0] = Definition(vec);
4734          ctx->block->instructions.emplace_back(std::move(load));
4735          emit_split_vector(ctx, vec, 4);
4736
4737          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4738                     emit_extract_vector(ctx, vec, 0, s1),
4739                     emit_extract_vector(ctx, vec, 1, s1),
4740                     emit_extract_vector(ctx, vec, 2, s1));
4741       } else {
4742          ctx->block->instructions.emplace_back(std::move(load));
4743       }
4744    }
4745 }
4746
4747 void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
4748 {
4749    Builder bld(ctx->program, ctx->block);
4750    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4751
4752    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4753    Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4754
4755    unsigned writemask = nir_intrinsic_write_mask(instr);
4756    while (writemask) {
4757       int start, count;
4758       u_bit_scan_consecutive_range(&writemask, &start, &count);
4759       unsigned num_bytes = count * elem_size_bytes;
4760
4761       Temp write_data = data;
4762       if (count != instr->num_components) {
4763          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4764          for (int i = 0; i < count; i++)
4765             vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
4766          write_data = bld.tmp(RegType::vgpr, count);
4767          vec->definitions[0] = Definition(write_data);
4768          ctx->block->instructions.emplace_back(std::move(vec));
4769       }
4770
4771       unsigned offset = start * elem_size_bytes;
4772       if (offset > 0 && ctx->options->chip_class < GFX9) {
4773          Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
4774          Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
4775          Temp carry = bld.tmp(bld.lm);
4776          bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
4777
4778          bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
4779                   Operand(offset), addr0);
4780          bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
4781                   Operand(0u), addr1,
4782                   carry).def(1).setHint(vcc);
4783
4784          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
4785
4786          offset = 0;
4787       }
4788
4789       bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4790       bool global = ctx->options->chip_class >= GFX9;
4791       aco_opcode op;
4792       switch (num_bytes) {
4793       case 4:
4794          op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
4795          break;
4796       case 8:
4797          op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
4798          break;
4799       case 12:
4800          op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
4801          break;
4802       case 16:
4803          op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
4804          break;
4805       default:
4806          unreachable("store_global not implemented for this size.");
4807       }
4808       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
4809       flat->operands[0] = Operand(addr);
4810       flat->operands[1] = Operand(s1);
4811       flat->operands[2] = Operand(data);
4812       flat->glc = glc;
4813       flat->dlc = false;
4814       flat->offset = offset;
4815       flat->disable_wqm = true;
4816       flat->barrier = barrier_buffer;
4817       ctx->program->needs_exact = true;
4818       ctx->block->instructions.emplace_back(std::move(flat));
4819    }
4820 }
4821
4822 void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4823 {
4824    /* return the previous value if dest is ever used */
4825    bool return_previous = false;
4826    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4827       return_previous = true;
4828       break;
4829    }
4830    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4831       return_previous = true;
4832       break;
4833    }
4834
4835    Builder bld(ctx->program, ctx->block);
4836    Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4837    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4838
4839    if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
4840       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
4841                         get_ssa_temp(ctx, instr->src[2].ssa), data);
4842
4843    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4844
4845    bool global = ctx->options->chip_class >= GFX9;
4846    aco_opcode op32, op64;
4847    switch (instr->intrinsic) {
4848       case nir_intrinsic_global_atomic_add:
4849          op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
4850          op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
4851          break;
4852       case nir_intrinsic_global_atomic_imin:
4853          op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
4854          op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
4855          break;
4856       case nir_intrinsic_global_atomic_umin:
4857          op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
4858          op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
4859          break;
4860       case nir_intrinsic_global_atomic_imax:
4861          op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
4862          op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
4863          break;
4864       case nir_intrinsic_global_atomic_umax:
4865          op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
4866          op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
4867          break;
4868       case nir_intrinsic_global_atomic_and:
4869          op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
4870          op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
4871          break;
4872       case nir_intrinsic_global_atomic_or:
4873          op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
4874          op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
4875          break;
4876       case nir_intrinsic_global_atomic_xor:
4877          op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
4878          op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
4879          break;
4880       case nir_intrinsic_global_atomic_exchange:
4881          op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
4882          op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
4883          break;
4884       case nir_intrinsic_global_atomic_comp_swap:
4885          op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
4886          op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
4887          break;
4888       default:
4889          unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
4890    }
4891    aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
4892    aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
4893    flat->operands[0] = Operand(addr);
4894    flat->operands[1] = Operand(s1);
4895    flat->operands[2] = Operand(data);
4896    if (return_previous)
4897       flat->definitions[0] = Definition(dst);
4898    flat->glc = return_previous;
4899    flat->dlc = false; /* Not needed for atomics */
4900    flat->offset = 0;
4901    flat->disable_wqm = true;
4902    flat->barrier = barrier_buffer;
4903    ctx->program->needs_exact = true;
4904    ctx->block->instructions.emplace_back(std::move(flat));
4905 }
4906
4907 void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
4908    Builder bld(ctx->program, ctx->block);
4909    switch(instr->intrinsic) {
4910       case nir_intrinsic_group_memory_barrier:
4911       case nir_intrinsic_memory_barrier:
4912          bld.barrier(aco_opcode::p_memory_barrier_all);
4913          break;
4914       case nir_intrinsic_memory_barrier_atomic_counter:
4915          bld.barrier(aco_opcode::p_memory_barrier_atomic);
4916          break;
4917       case nir_intrinsic_memory_barrier_buffer:
4918          bld.barrier(aco_opcode::p_memory_barrier_buffer);
4919          break;
4920       case nir_intrinsic_memory_barrier_image:
4921          bld.barrier(aco_opcode::p_memory_barrier_image);
4922          break;
4923       case nir_intrinsic_memory_barrier_shared:
4924          bld.barrier(aco_opcode::p_memory_barrier_shared);
4925          break;
4926       default:
4927          unreachable("Unimplemented memory barrier intrinsic");
4928          break;
4929    }
4930 }
4931
4932 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4933 {
4934    // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
4935    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4936    assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
4937    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4938    Builder bld(ctx->program, ctx->block);
4939
4940    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
4941    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
4942    load_lds(ctx, elem_size_bytes, dst, address, nir_intrinsic_base(instr), align);
4943 }
4944
4945 void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4946 {
4947    unsigned writemask = nir_intrinsic_write_mask(instr);
4948    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4949    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4950    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4951    assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
4952
4953    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
4954    store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
4955 }
4956
4957 void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4958 {
4959    unsigned offset = nir_intrinsic_base(instr);
4960    Operand m = load_lds_size_m0(ctx);
4961    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4962    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4963
4964    unsigned num_operands = 3;
4965    aco_opcode op32, op64, op32_rtn, op64_rtn;
4966    switch(instr->intrinsic) {
4967       case nir_intrinsic_shared_atomic_add:
4968          op32 = aco_opcode::ds_add_u32;
4969          op64 = aco_opcode::ds_add_u64;
4970          op32_rtn = aco_opcode::ds_add_rtn_u32;
4971          op64_rtn = aco_opcode::ds_add_rtn_u64;
4972          break;
4973       case nir_intrinsic_shared_atomic_imin:
4974          op32 = aco_opcode::ds_min_i32;
4975          op64 = aco_opcode::ds_min_i64;
4976          op32_rtn = aco_opcode::ds_min_rtn_i32;
4977          op64_rtn = aco_opcode::ds_min_rtn_i64;
4978          break;
4979       case nir_intrinsic_shared_atomic_umin:
4980          op32 = aco_opcode::ds_min_u32;
4981          op64 = aco_opcode::ds_min_u64;
4982          op32_rtn = aco_opcode::ds_min_rtn_u32;
4983          op64_rtn = aco_opcode::ds_min_rtn_u64;
4984          break;
4985       case nir_intrinsic_shared_atomic_imax:
4986          op32 = aco_opcode::ds_max_i32;
4987          op64 = aco_opcode::ds_max_i64;
4988          op32_rtn = aco_opcode::ds_max_rtn_i32;
4989          op64_rtn = aco_opcode::ds_max_rtn_i64;
4990          break;
4991       case nir_intrinsic_shared_atomic_umax:
4992          op32 = aco_opcode::ds_max_u32;
4993          op64 = aco_opcode::ds_max_u64;
4994          op32_rtn = aco_opcode::ds_max_rtn_u32;
4995          op64_rtn = aco_opcode::ds_max_rtn_u64;
4996          break;
4997       case nir_intrinsic_shared_atomic_and:
4998          op32 = aco_opcode::ds_and_b32;
4999          op64 = aco_opcode::ds_and_b64;
5000          op32_rtn = aco_opcode::ds_and_rtn_b32;
5001          op64_rtn = aco_opcode::ds_and_rtn_b64;
5002          break;
5003       case nir_intrinsic_shared_atomic_or:
5004          op32 = aco_opcode::ds_or_b32;
5005          op64 = aco_opcode::ds_or_b64;
5006          op32_rtn = aco_opcode::ds_or_rtn_b32;
5007          op64_rtn = aco_opcode::ds_or_rtn_b64;
5008          break;
5009       case nir_intrinsic_shared_atomic_xor:
5010          op32 = aco_opcode::ds_xor_b32;
5011          op64 = aco_opcode::ds_xor_b64;
5012          op32_rtn = aco_opcode::ds_xor_rtn_b32;
5013          op64_rtn = aco_opcode::ds_xor_rtn_b64;
5014          break;
5015       case nir_intrinsic_shared_atomic_exchange:
5016          op32 = aco_opcode::ds_write_b32;
5017          op64 = aco_opcode::ds_write_b64;
5018          op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
5019          op64_rtn = aco_opcode::ds_wrxchg2_rtn_b64;
5020          break;
5021       case nir_intrinsic_shared_atomic_comp_swap:
5022          op32 = aco_opcode::ds_cmpst_b32;
5023          op64 = aco_opcode::ds_cmpst_b64;
5024          op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
5025          op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
5026          num_operands = 4;
5027          break;
5028       default:
5029          unreachable("Unhandled shared atomic intrinsic");
5030    }
5031
5032    /* return the previous value if dest is ever used */
5033    bool return_previous = false;
5034    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
5035       return_previous = true;
5036       break;
5037    }
5038    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
5039       return_previous = true;
5040       break;
5041    }
5042
5043    aco_opcode op;
5044    if (data.size() == 1) {
5045       assert(instr->dest.ssa.bit_size == 32);
5046       op = return_previous ? op32_rtn : op32;
5047    } else {
5048       assert(instr->dest.ssa.bit_size == 64);
5049       op = return_previous ? op64_rtn : op64;
5050    }
5051
5052    if (offset > 65535) {
5053       Builder bld(ctx->program, ctx->block);
5054       address = bld.vadd32(bld.def(v1), Operand(offset), address);
5055       offset = 0;
5056    }
5057
5058    aco_ptr<DS_instruction> ds;
5059    ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
5060    ds->operands[0] = Operand(address);
5061    ds->operands[1] = Operand(data);
5062    if (num_operands == 4)
5063       ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
5064    ds->operands[num_operands - 1] = m;
5065    ds->offset0 = offset;
5066    if (return_previous)
5067       ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
5068    ctx->block->instructions.emplace_back(std::move(ds));
5069 }
5070
5071 Temp get_scratch_resource(isel_context *ctx)
5072 {
5073    Builder bld(ctx->program, ctx->block);
5074    Temp scratch_addr = ctx->program->private_segment_buffer;
5075    if (ctx->stage != compute_cs)
5076       scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
5077
5078    uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
5079                         S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);;
5080
5081    if (ctx->program->chip_class >= GFX10) {
5082       rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
5083                    S_008F0C_OOB_SELECT(3) |
5084                    S_008F0C_RESOURCE_LEVEL(1);
5085    } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
5086       rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5087                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5088    }
5089
5090    /* older generations need element size = 16 bytes. element size removed in GFX9 */
5091    if (ctx->program->chip_class <= GFX8)
5092       rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
5093
5094    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
5095 }
5096
5097 void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
5098    assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64);
5099    Builder bld(ctx->program, ctx->block);
5100    Temp rsrc = get_scratch_resource(ctx);
5101    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5102    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5103
5104    aco_opcode op;
5105    switch (dst.size()) {
5106       case 1:
5107          op = aco_opcode::buffer_load_dword;
5108          break;
5109       case 2:
5110          op = aco_opcode::buffer_load_dwordx2;
5111          break;
5112       case 3:
5113          op = aco_opcode::buffer_load_dwordx3;
5114          break;
5115       case 4:
5116          op = aco_opcode::buffer_load_dwordx4;
5117          break;
5118       case 6:
5119       case 8: {
5120          std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
5121          Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
5122                                 bld.def(v4), offset, rsrc,
5123                                 ctx->program->scratch_offset, 0, true);
5124          Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
5125                                                   aco_opcode::buffer_load_dwordx4,
5126                                 dst.size() == 6 ? bld.def(v2) : bld.def(v4),
5127                                 offset, rsrc, ctx->program->scratch_offset, 16, true);
5128          emit_split_vector(ctx, lower, 2);
5129          elems[0] = emit_extract_vector(ctx, lower, 0, v2);
5130          elems[1] = emit_extract_vector(ctx, lower, 1, v2);
5131          if (dst.size() == 8) {
5132             emit_split_vector(ctx, upper, 2);
5133             elems[2] = emit_extract_vector(ctx, upper, 0, v2);
5134             elems[3] = emit_extract_vector(ctx, upper, 1, v2);
5135          } else {
5136             elems[2] = upper;
5137          }
5138
5139          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
5140                                                                          Format::PSEUDO, dst.size() / 2, 1)};
5141          for (unsigned i = 0; i < dst.size() / 2; i++)
5142             vec->operands[i] = Operand(elems[i]);
5143          vec->definitions[0] = Definition(dst);
5144          bld.insert(std::move(vec));
5145          ctx->allocated_vec.emplace(dst.id(), elems);
5146          return;
5147       }
5148       default:
5149          unreachable("Wrong dst size for nir_intrinsic_load_scratch");
5150    }
5151
5152    bld.mubuf(op, Definition(dst), offset, rsrc, ctx->program->scratch_offset, 0, true);
5153    emit_split_vector(ctx, dst, instr->num_components);
5154 }
5155
5156 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
5157    assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
5158    Builder bld(ctx->program, ctx->block);
5159    Temp rsrc = get_scratch_resource(ctx);
5160    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5161    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
5162
5163    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
5164    unsigned writemask = nir_intrinsic_write_mask(instr);
5165
5166    while (writemask) {
5167       int start, count;
5168       u_bit_scan_consecutive_range(&writemask, &start, &count);
5169       int num_bytes = count * elem_size_bytes;
5170
5171       if (num_bytes > 16) {
5172          assert(elem_size_bytes == 8);
5173          writemask |= (((count - 2) << 1) - 1) << (start + 2);
5174          count = 2;
5175          num_bytes = 16;
5176       }
5177
5178       // TODO: check alignment of sub-dword stores
5179       // TODO: split 3 bytes. there is no store instruction for that
5180
5181       Temp write_data;
5182       if (count != instr->num_components) {
5183          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5184          for (int i = 0; i < count; i++) {
5185             Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
5186             vec->operands[i] = Operand(elem);
5187          }
5188          write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
5189          vec->definitions[0] = Definition(write_data);
5190          ctx->block->instructions.emplace_back(std::move(vec));
5191       } else {
5192          write_data = data;
5193       }
5194
5195       aco_opcode op;
5196       switch (num_bytes) {
5197          case 4:
5198             op = aco_opcode::buffer_store_dword;
5199             break;
5200          case 8:
5201             op = aco_opcode::buffer_store_dwordx2;
5202             break;
5203          case 12:
5204             op = aco_opcode::buffer_store_dwordx3;
5205             break;
5206          case 16:
5207             op = aco_opcode::buffer_store_dwordx4;
5208             break;
5209          default:
5210             unreachable("Invalid data size for nir_intrinsic_store_scratch.");
5211       }
5212
5213       bld.mubuf(op, offset, rsrc, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true);
5214    }
5215 }
5216
5217 void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
5218    uint8_t log2_ps_iter_samples;
5219    if (ctx->program->info->ps.force_persample) {
5220       log2_ps_iter_samples =
5221          util_logbase2(ctx->options->key.fs.num_samples);
5222    } else {
5223       log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
5224    }
5225
5226    /* The bit pattern matches that used by fixed function fragment
5227     * processing. */
5228    static const unsigned ps_iter_masks[] = {
5229       0xffff, /* not used */
5230       0x5555,
5231       0x1111,
5232       0x0101,
5233       0x0001,
5234    };
5235    assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
5236
5237    Builder bld(ctx->program, ctx->block);
5238
5239    Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
5240                              get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
5241    Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
5242    Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
5243    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5244    bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
5245 }
5246
5247 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
5248 {
5249    Builder bld(ctx->program, ctx->block);
5250
5251    if (cluster_size == 1) {
5252       return src;
5253    } if (op == nir_op_iand && cluster_size == 4) {
5254       //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
5255       Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
5256       return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
5257                       bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
5258    } else if (op == nir_op_ior && cluster_size == 4) {
5259       //subgroupClusteredOr(val, 4) -> wqm(val & exec)
5260       return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
5261                       bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
5262    } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
5263       //subgroupAnd(val) -> (exec & ~val) == 0
5264       Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
5265       return bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp));
5266    } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
5267       //subgroupOr(val) -> (val & exec) != 0
5268       Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp();
5269       return bool_to_vector_condition(ctx, tmp);
5270    } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
5271       //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
5272       Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
5273       tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
5274       tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
5275       return bool_to_vector_condition(ctx, tmp);
5276    } else {
5277       //subgroupClustered{And,Or,Xor}(val, n) ->
5278       //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) ;  just v_mbcnt_lo_u32_b32 on wave32
5279       //cluster_offset = ~(n - 1) & lane_id
5280       //cluster_mask = ((1 << n) - 1)
5281       //subgroupClusteredAnd():
5282       //   return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
5283       //subgroupClusteredOr():
5284       //   return ((val & exec) >> cluster_offset) & cluster_mask != 0
5285       //subgroupClusteredXor():
5286       //   return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
5287       Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
5288       Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
5289
5290       Temp tmp;
5291       if (op == nir_op_iand)
5292          tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
5293       else
5294          tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
5295
5296       uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
5297
5298       if (ctx->program->chip_class <= GFX7)
5299          tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
5300       else if (ctx->program->wave_size == 64)
5301          tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
5302       else
5303          tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
5304       tmp = emit_extract_vector(ctx, tmp, 0, v1);
5305       if (cluster_mask != 0xffffffff)
5306          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
5307
5308       Definition cmp_def = Definition();
5309       if (op == nir_op_iand) {
5310          cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(cluster_mask), tmp).def(0);
5311       } else if (op == nir_op_ior) {
5312          cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
5313       } else if (op == nir_op_ixor) {
5314          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
5315                         bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
5316          cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
5317       }
5318       cmp_def.setHint(vcc);
5319       return cmp_def.getTemp();
5320    }
5321 }
5322
5323 Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
5324 {
5325    Builder bld(ctx->program, ctx->block);
5326
5327    //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
5328    //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
5329    //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
5330    Temp tmp;
5331    if (op == nir_op_iand)
5332       tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
5333    else
5334       tmp = bld.sop2(Builder::s_and, bld.def(s2), bld.def(s1, scc), src, Operand(exec, bld.lm));
5335
5336    Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
5337    Temp lo = lohi.def(0).getTemp();
5338    Temp hi = lohi.def(1).getTemp();
5339    Temp mbcnt = emit_mbcnt(ctx, bld.def(v1), Operand(lo), Operand(hi));
5340
5341    Definition cmp_def = Definition();
5342    if (op == nir_op_iand)
5343       cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
5344    else if (op == nir_op_ior)
5345       cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
5346    else if (op == nir_op_ixor)
5347       cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u),
5348                          bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
5349    cmp_def.setHint(vcc);
5350    return cmp_def.getTemp();
5351 }
5352
5353 Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
5354 {
5355    Builder bld(ctx->program, ctx->block);
5356
5357    //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
5358    //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
5359    //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
5360    Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
5361    if (op == nir_op_iand)
5362       return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
5363    else if (op == nir_op_ior)
5364       return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
5365    else if (op == nir_op_ixor)
5366       return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
5367
5368    assert(false);
5369    return Temp();
5370 }
5371
5372 void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
5373 {
5374    Builder bld(ctx->program, ctx->block);
5375    Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
5376    if (src.regClass().type() == RegType::vgpr) {
5377       bld.pseudo(aco_opcode::p_as_uniform, dst, src);
5378    } else if (src.regClass() == s1) {
5379       bld.sop1(aco_opcode::s_mov_b32, dst, src);
5380    } else if (src.regClass() == s2) {
5381       bld.sop1(aco_opcode::s_mov_b64, dst, src);
5382    } else {
5383       fprintf(stderr, "Unimplemented NIR instr bit size: ");
5384       nir_print_instr(&instr->instr, stderr);
5385       fprintf(stderr, "\n");
5386    }
5387 }
5388
5389 void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
5390 {
5391    Builder bld(ctx->program, ctx->block);
5392    Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center);
5393    Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1);
5394    Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1);
5395
5396    Temp ddx_1, ddx_2, ddy_1, ddy_2;
5397    uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
5398    uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
5399    uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
5400
5401    /* Build DD X/Y */
5402    if (ctx->program->chip_class >= GFX8) {
5403       Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
5404       ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
5405       ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
5406       Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
5407       ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
5408       ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
5409    } else {
5410       Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
5411       ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
5412       ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
5413       ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
5414       ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_1);
5415       Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
5416       ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
5417       ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_2);
5418       ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
5419       ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
5420    }
5421
5422    /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
5423    Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
5424    Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
5425    tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
5426    tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
5427    Temp wqm1 = bld.tmp(v1);
5428    emit_wqm(ctx, tmp1, wqm1, true);
5429    Temp wqm2 = bld.tmp(v1);
5430    emit_wqm(ctx, tmp2, wqm2, true);
5431    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
5432    return;
5433 }
5434
5435 void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
5436 {
5437    Builder bld(ctx->program, ctx->block);
5438    switch(instr->intrinsic) {
5439    case nir_intrinsic_load_barycentric_sample:
5440    case nir_intrinsic_load_barycentric_pixel:
5441    case nir_intrinsic_load_barycentric_centroid: {
5442       glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
5443       Temp bary = Temp(0, s2);
5444       switch (mode) {
5445       case INTERP_MODE_SMOOTH:
5446       case INTERP_MODE_NONE:
5447          if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
5448             bary = get_arg(ctx, ctx->args->ac.persp_center);
5449          else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
5450             bary = ctx->persp_centroid;
5451          else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
5452             bary = get_arg(ctx, ctx->args->ac.persp_sample);
5453          break;
5454       case INTERP_MODE_NOPERSPECTIVE:
5455          if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
5456             bary = get_arg(ctx, ctx->args->ac.linear_center);
5457          else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
5458             bary = ctx->linear_centroid;
5459          else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
5460             bary = get_arg(ctx, ctx->args->ac.linear_sample);
5461          break;
5462       default:
5463          break;
5464       }
5465       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5466       Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
5467       Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
5468       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5469                  Operand(p1), Operand(p2));
5470       emit_split_vector(ctx, dst, 2);
5471       break;
5472    }
5473    case nir_intrinsic_load_barycentric_at_sample: {
5474       uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
5475       switch (ctx->options->key.fs.num_samples) {
5476          case 2: sample_pos_offset += 1 << 3; break;
5477          case 4: sample_pos_offset += 3 << 3; break;
5478          case 8: sample_pos_offset += 7 << 3; break;
5479          default: break;
5480       }
5481       Temp sample_pos;
5482       Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
5483       nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
5484       Temp private_segment_buffer = ctx->program->private_segment_buffer;
5485       if (addr.type() == RegType::sgpr) {
5486          Operand offset;
5487          if (const_addr) {
5488             sample_pos_offset += const_addr->u32 << 3;
5489             offset = Operand(sample_pos_offset);
5490          } else if (ctx->options->chip_class >= GFX9) {
5491             offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
5492          } else {
5493             offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
5494             offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
5495          }
5496          sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand(offset));
5497
5498       } else if (ctx->options->chip_class >= GFX9) {
5499          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5500          sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, private_segment_buffer, sample_pos_offset);
5501       } else {
5502          /* addr += private_segment_buffer + sample_pos_offset */
5503          Temp tmp0 = bld.tmp(s1);
5504          Temp tmp1 = bld.tmp(s1);
5505          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), private_segment_buffer);
5506          Definition scc_tmp = bld.def(s1, scc);
5507          tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
5508          tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), bld.scc(scc_tmp.getTemp()));
5509          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5510          Temp pck0 = bld.tmp(v1);
5511          Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
5512          tmp1 = as_vgpr(ctx, tmp1);
5513          Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry);
5514          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
5515
5516          /* sample_pos = flat_load_dwordx2 addr */
5517          sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
5518       }
5519
5520       /* sample_pos -= 0.5 */
5521       Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
5522       Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
5523       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
5524       pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
5525       pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
5526
5527       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5528       break;
5529    }
5530    case nir_intrinsic_load_barycentric_at_offset: {
5531       Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5532       RegClass rc = RegClass(offset.type(), 1);
5533       Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
5534       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
5535       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5536       break;
5537    }
   case nir_intrinsic_load_front_face: {
      /* Boolean result: set for lanes whose front_face argument is non-zero.
       * The definition is hinted towards VCC. */
      bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc);
      break;
   }
   case nir_intrinsic_load_view_index:
   case nir_intrinsic_load_layer_id: {
      /* When the view index is requested in a stage with the sw_vs bit set,
       * it is available directly as a shader argument. */
      if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & sw_vs)) {
         Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
         bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
         break;
      }

      /* Otherwise read it as a non-interpolated input attribute: the intrinsic
       * base selects the attribute, prim_mask is supplied through m0.
       * NOTE(review): Operand(2u) presumably selects the P0 vertex value for
       * v_interp_mov_f32 -- confirm against the ISA documentation. */
      unsigned idx = nir_intrinsic_base(instr);
      bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
                 Operand(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0);
      break;
   }
   case nir_intrinsic_load_frag_coord: {
      /* All four components of the fragment coordinate. */
      emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
      break;
   }
   case nir_intrinsic_load_sample_pos: {
      /* The sample position is the fractional part of the fragment position.
       * A component whose argument temp has id 0 is replaced by constant 0. */
      Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
      Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
      bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
                 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
                 posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
      break;
   }
   /* The intrinsics below are forwarded unchanged to a dedicated visit_*()
    * helper; no instruction selection happens in this switch for them. */

   /* Shader inputs/outputs, constants and resources. */
   case nir_intrinsic_load_interpolated_input:
      visit_load_interpolated_input(ctx, instr);
      break;
   case nir_intrinsic_store_output:
      visit_store_output(ctx, instr);
      break;
   case nir_intrinsic_load_input:
      visit_load_input(ctx, instr);
      break;
   case nir_intrinsic_load_ubo:
      visit_load_ubo(ctx, instr);
      break;
   case nir_intrinsic_load_push_constant:
      visit_load_push_constant(ctx, instr);
      break;
   case nir_intrinsic_load_constant:
      visit_load_constant(ctx, instr);
      break;
   case nir_intrinsic_vulkan_resource_index:
      visit_load_resource(ctx, instr);
      break;
   /* Fragment discard. */
   case nir_intrinsic_discard:
      visit_discard(ctx, instr);
      break;
   case nir_intrinsic_discard_if:
      visit_discard_if(ctx, instr);
      break;
   /* Shared memory accesses; all atomic flavours share one visitor. */
   case nir_intrinsic_load_shared:
      visit_load_shared(ctx, instr);
      break;
   case nir_intrinsic_store_shared:
      visit_store_shared(ctx, instr);
      break;
   case nir_intrinsic_shared_atomic_add:
   case nir_intrinsic_shared_atomic_imin:
   case nir_intrinsic_shared_atomic_umin:
   case nir_intrinsic_shared_atomic_imax:
   case nir_intrinsic_shared_atomic_umax:
   case nir_intrinsic_shared_atomic_and:
   case nir_intrinsic_shared_atomic_or:
   case nir_intrinsic_shared_atomic_xor:
   case nir_intrinsic_shared_atomic_exchange:
   case nir_intrinsic_shared_atomic_comp_swap:
      visit_shared_atomic(ctx, instr);
      break;
   /* Image accesses through derefs. */
   case nir_intrinsic_image_deref_load:
      visit_image_load(ctx, instr);
      break;
   case nir_intrinsic_image_deref_store:
      visit_image_store(ctx, instr);
      break;
   case nir_intrinsic_image_deref_atomic_add:
   case nir_intrinsic_image_deref_atomic_umin:
   case nir_intrinsic_image_deref_atomic_imin:
   case nir_intrinsic_image_deref_atomic_umax:
   case nir_intrinsic_image_deref_atomic_imax:
   case nir_intrinsic_image_deref_atomic_and:
   case nir_intrinsic_image_deref_atomic_or:
   case nir_intrinsic_image_deref_atomic_xor:
   case nir_intrinsic_image_deref_atomic_exchange:
   case nir_intrinsic_image_deref_atomic_comp_swap:
      visit_image_atomic(ctx, instr);
      break;
   case nir_intrinsic_image_deref_size:
      visit_image_size(ctx, instr);
      break;
   /* SSBO and global memory accesses. */
   case nir_intrinsic_load_ssbo:
      visit_load_ssbo(ctx, instr);
      break;
   case nir_intrinsic_store_ssbo:
      visit_store_ssbo(ctx, instr);
      break;
   case nir_intrinsic_load_global:
      visit_load_global(ctx, instr);
      break;
   case nir_intrinsic_store_global:
      visit_store_global(ctx, instr);
      break;
   case nir_intrinsic_global_atomic_add:
   case nir_intrinsic_global_atomic_imin:
   case nir_intrinsic_global_atomic_umin:
   case nir_intrinsic_global_atomic_imax:
   case nir_intrinsic_global_atomic_umax:
   case nir_intrinsic_global_atomic_and:
   case nir_intrinsic_global_atomic_or:
   case nir_intrinsic_global_atomic_xor:
   case nir_intrinsic_global_atomic_exchange:
   case nir_intrinsic_global_atomic_comp_swap:
      visit_global_atomic(ctx, instr);
      break;
   case nir_intrinsic_ssbo_atomic_add:
   case nir_intrinsic_ssbo_atomic_imin:
   case nir_intrinsic_ssbo_atomic_umin:
   case nir_intrinsic_ssbo_atomic_imax:
   case nir_intrinsic_ssbo_atomic_umax:
   case nir_intrinsic_ssbo_atomic_and:
   case nir_intrinsic_ssbo_atomic_or:
   case nir_intrinsic_ssbo_atomic_xor:
   case nir_intrinsic_ssbo_atomic_exchange:
   case nir_intrinsic_ssbo_atomic_comp_swap:
      visit_atomic_ssbo(ctx, instr);
      break;
   /* Scratch memory and buffer size queries. */
   case nir_intrinsic_load_scratch:
      visit_load_scratch(ctx, instr);
      break;
   case nir_intrinsic_store_scratch:
      visit_store_scratch(ctx, instr);
      break;
   case nir_intrinsic_get_buffer_size:
      visit_get_buffer_size(ctx, instr);
      break;
   case nir_intrinsic_barrier: {
      /* An s_barrier is only emitted when the workgroup (product of the three
       * compute block dimensions) is larger than one wave; otherwise nothing
       * is emitted for this intrinsic. */
      unsigned* bsize = ctx->program->info->cs.block_size;
      unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
      if (workgroup_size > ctx->program->wave_size)
         bld.sopp(aco_opcode::s_barrier);
      break;
   }
   /* Every memory-barrier flavour is handled by emit_memory_barrier(), which
    * receives the instruction and can distinguish them. */
   case nir_intrinsic_group_memory_barrier:
   case nir_intrinsic_memory_barrier:
   case nir_intrinsic_memory_barrier_atomic_counter:
   case nir_intrinsic_memory_barrier_buffer:
   case nir_intrinsic_memory_barrier_image:
   case nir_intrinsic_memory_barrier_shared:
      emit_memory_barrier(ctx, instr);
      break;
   case nir_intrinsic_load_num_work_groups: {
      /* Copy the three-component shader argument and split it into
       * per-component temps via emit_split_vector(). */
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
      emit_split_vector(ctx, dst, 3);
      break;
   }
   case nir_intrinsic_load_local_invocation_id: {
      /* Same pattern as above, sourced from the local_invocation_ids argument. */
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
      emit_split_vector(ctx, dst, 3);
      break;
   }
   case nir_intrinsic_load_work_group_id: {
      /* Each workgroup-id component is a separate argument; components whose
       * argument is not marked as used are filled with constant 0. */
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      struct ac_arg *args = ctx->args->ac.workgroup_ids;
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
                 args[0].used ? Operand(get_arg(ctx, args[0])) : Operand(0u),
                 args[1].used ? Operand(get_arg(ctx, args[1])) : Operand(0u),
                 args[2].used ? Operand(get_arg(ctx, args[2])) : Operand(0u));
      emit_split_vector(ctx, dst, 3);
      break;
   }
   case nir_intrinsic_load_local_invocation_index: {
      /* Lane index within the wave, as produced by emit_mbcnt(). */
      Temp id = emit_mbcnt(ctx, bld.def(v1));

      /* The tg_size bits [6:11] contain the subgroup id,
       * we need this multiplied by the wave size, and then OR the thread id to it.
       */
      if (ctx->program->wave_size == 64) {
         /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just feed that to v_or */
         Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u),
                                get_arg(ctx, ctx->args->ac.tg_size));
         bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
      } else {
         /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR  */
         Temp tg_num = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
                                get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
         bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, Operand(0x5u), id);
      }
      break;
   }
   case nir_intrinsic_load_subgroup_id: {
      /* Compute shaders: extract 6 bits starting at bit 6 of tg_size (the
       * s_bfe_u32 operand encodes offset 6 in the low half and width 6 in the
       * high half). Every other stage gets constant 0. */
      if (ctx->stage == compute_cs) {
         bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc),
                  get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
      } else {
         bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
      }
      break;
   }
   case nir_intrinsic_load_subgroup_invocation: {
      /* The lane index is written straight into the destination. */
      emit_mbcnt(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)));
      break;
   }
   case nir_intrinsic_load_num_subgroups: {
      /* Compute shaders: the low 6 bits of tg_size. Every other stage reports
       * exactly one subgroup. */
      if (ctx->stage == compute_cs)
         bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu),
                  get_arg(ctx, ctx->args->ac.tg_size));
      else
         bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
      break;
   }
5756    case nir_intrinsic_ballot: {
5757       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5758       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5759       Definition tmp = bld.def(dst.regClass());
5760       Definition lanemask_tmp = dst.size() == bld.lm.size() ? tmp : bld.def(src.regClass());
5761       if (instr->src[0].ssa->bit_size == 1) {
5762          assert(src.regClass() == bld.lm);
5763          bld.sop2(Builder::s_and, lanemask_tmp, bld.def(s1, scc), Operand(exec, bld.lm), src);
5764       } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
5765          bld.vopc(aco_opcode::v_cmp_lg_u32, lanemask_tmp, Operand(0u), src);
5766       } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
5767          bld.vopc(aco_opcode::v_cmp_lg_u64, lanemask_tmp, Operand(0u), src);
5768       } else {
5769          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5770          nir_print_instr(&instr->instr, stderr);
5771          fprintf(stderr, "\n");
5772       }
5773       if (dst.size() != bld.lm.size()) {
5774          /* Wave32 with ballot size set to 64 */
5775          bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lanemask_tmp.getTemp(), Operand(0u));
5776       }
5777       emit_wqm(ctx, tmp.getTemp(), dst);
5778       break;
5779    }
   case nir_intrinsic_shuffle:
   case nir_intrinsic_read_invocation: {
      /* Read src from the lane selected by src[1]. A non-divergent source is
       * handled by emit_uniform_subgroup(). */
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      if (!ctx->divergent_vals[instr->src[0].ssa->index]) {
         emit_uniform_subgroup(ctx, instr, src);
      } else {
         Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
         /* read_invocation always uses a uniform lane index; shuffle only
          * when the index is known not to be divergent. */
         if (instr->intrinsic == nir_intrinsic_read_invocation || !ctx->divergent_vals[instr->src[1].ssa->index])
            tid = bld.as_uniform(tid);
         Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
         if (src.regClass() == v1) {
            emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst);
         } else if (src.regClass() == v2) {
            /* 64-bit values are permuted one dword at a time. */
            Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
            bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
            lo = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, lo));
            hi = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, hi));
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
            emit_split_vector(ctx, dst, 2);
         } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
            /* Boolean with uniform index: test bit `tid` of the lane mask and
             * turn the SCC result back into a vector condition. */
            assert(src.regClass() == bld.lm);
            Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
            bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
         } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
            /* Boolean with divergent index: per lane, shift the lane mask right
             * by tid and test bit 0. The shift opcode depends on the chip class
             * and on the wave size (64-bit vs. 32-bit mask). */
            assert(src.regClass() == bld.lm);
            Temp tmp;
            if (ctx->program->chip_class <= GFX7)
               tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
            else if (ctx->program->wave_size == 64)
               tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
            else
               tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
            tmp = emit_extract_vector(ctx, tmp, 0, v1);
            tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
            emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst);
         } else {
            fprintf(stderr, "Unimplemented NIR instr bit size: ");
            nir_print_instr(&instr->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
      break;
   }
   case nir_intrinsic_load_sample_id: {
      /* The sample id is a 4-bit field starting at bit 8 of the ancillary argument. */
      bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
               get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
      break;
   }
   case nir_intrinsic_load_sample_mask_in: {
      /* Handled entirely by its own visitor. */
      visit_load_sample_mask_in(ctx, instr);
      break;
   }
   case nir_intrinsic_read_first_invocation: {
      /* Broadcast the value held by the first active lane. The strategy
       * depends on the register class of the source. */
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      if (src.regClass() == v1) {
         emit_wqm(ctx,
                  bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
                  dst);
      } else if (src.regClass() == v2) {
         /* 64-bit: read each dword separately and recombine. */
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
         hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
         emit_split_vector(ctx, dst, 2);
      } else if (instr->dest.ssa.bit_size == 1) {
         /* Boolean: find the index of the first set bit in exec, test that bit
          * of the lane mask, and expand the SCC result to a vector condition. */
         assert(src.regClass() == bld.lm);
         Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
                             bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
         bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
      } else if (src.regClass() == s1) {
         /* Scalar sources are simply copied. */
         bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
      } else if (src.regClass() == s2) {
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_intrinsic_vote_all: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      assert(src.regClass() == bld.lm);
      assert(dst.regClass() == bld.lm);

      /* exec & ~src keeps the active lanes where src is false. Only the SCC
       * definition (def(1)) of the s_andn2 is used: the select yields 0 when
       * SCC is set and -1 otherwise. */
      Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
      Temp val = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(0u), Operand(-1u), bld.scc(tmp));
      emit_wqm(ctx, val, dst);
      break;
   }
   case nir_intrinsic_vote_any: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      assert(src.regClass() == bld.lm);
      assert(dst.regClass() == bld.lm);

      /* exec & src keeps the active lanes where src is true. Only the SCC
       * definition (def(1)) of the s_and is used: the select yields -1 when
       * SCC is set and 0 otherwise. */
      Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
      Temp val = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), Operand(0u), bld.scc(tmp));
      emit_wqm(ctx, val, dst);
      break;
   }
   case nir_intrinsic_reduce:
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan: {
      /* Subgroup reduction and inclusive/exclusive scan. */
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
      /* Only reduce carries a cluster size; scans use 0. A cluster size of 0
       * means the whole wave. The value is clamped to the wave size and
       * rounded up to a power of two. */
      unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
         nir_intrinsic_cluster_size(instr) : 0;
      cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));

      if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
         /* OR/AND of a non-divergent value goes through emit_uniform_subgroup(). */
         emit_uniform_subgroup(ctx, instr, src);
      } else if (instr->dest.ssa.bit_size == 1) {
         /* Booleans: map every supported operation onto and/xor/or first. */
         if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
            op = nir_op_iand;
         else if (op == nir_op_iadd)
            op = nir_op_ixor;
         else if (op == nir_op_umax || op == nir_op_imax)
            op = nir_op_ior;
         assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);

         switch (instr->intrinsic) {
         case nir_intrinsic_reduce:
            emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
            break;
         case nir_intrinsic_exclusive_scan:
            emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
            break;
         case nir_intrinsic_inclusive_scan:
            emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
            break;
         default:
            assert(false);
         }
      } else if (cluster_size == 1) {
         /* A cluster of a single lane leaves the value unchanged. */
         bld.copy(Definition(dst), src);
      } else {
         src = as_vgpr(ctx, src);

         /* Pick the 32- or 64-bit ReduceOp variant from the source register class. */
         ReduceOp reduce_op;
         switch (op) {
         #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
            CASE(iadd)
            CASE(imul)
            CASE(fadd)
            CASE(fmul)
            CASE(imin)
            CASE(umin)
            CASE(fmin)
            CASE(imax)
            CASE(umax)
            CASE(fmax)
            CASE(iand)
            CASE(ior)
            CASE(ixor)
            default:
               unreachable("unknown reduction op");
         #undef CASE
         }

         aco_opcode aco_op;
         switch (instr->intrinsic) {
            case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
            case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
            case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
            default:
               unreachable("unknown reduce intrinsic");
         }

         /* The pseudo instruction is built by hand: 3 operands, 5 definitions. */
         aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
         reduce->operands[0] = Operand(src);
         // filled in by aco_reduce_assign.cpp, used internally as part of the
         // reduce sequence
         assert(dst.size() == 1 || dst.size() == 2);
         reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
         reduce->operands[2] = Operand(v1.as_linear());

         /* The result goes to a fresh temp so that emit_wqm() can produce dst. */
         Temp tmp_dst = bld.tmp(dst.regClass());
         reduce->definitions[0] = Definition(tmp_dst);
         reduce->definitions[1] = bld.def(ctx->program->lane_mask); // used internally
         reduce->definitions[2] = Definition();
         reduce->definitions[3] = Definition(scc, s1);
         reduce->definitions[4] = Definition();
         reduce->reduce_op = reduce_op;
         reduce->cluster_size = cluster_size;
         ctx->block->instructions.emplace_back(std::move(reduce));

         emit_wqm(ctx, tmp_dst, dst);
      }
      break;
   }
   case nir_intrinsic_quad_broadcast: {
      /* Broadcast the value of one lane of each quad to all four lanes.
       * The lane index is read as a constant from src[1]. */
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      if (!ctx->divergent_vals[instr->dest.ssa.index]) {
         emit_uniform_subgroup(ctx, instr, src);
      } else {
         Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
         /* nir_src_as_const_value() is dereferenced unchecked: src[1] is
          * assumed to be constant here. */
         unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
         uint32_t dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);

         if (instr->dest.ssa.bit_size == 1) {
            assert(src.regClass() == bld.lm);
            assert(dst.regClass() == bld.lm);
            /* Keep only the bit of the selected lane in every group of four
             * (0x11111111 << lane), restricted to active lanes, then let s_wqm
             * spread each surviving bit over its whole quad.
             * NOTE(review): mask_tmp is always created as s2 while the s_and
             * definitions use bld.lm -- confirm this is valid when the lane
             * mask is a single dword. */
            uint32_t half_mask = 0x11111111u << lane;
            Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
            Temp tmp = bld.tmp(bld.lm);
            bld.sop1(Builder::s_wqm, Definition(tmp),
                     bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp,
                              bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))));
            emit_wqm(ctx, tmp, dst);
         } else if (instr->dest.ssa.bit_size == 32) {
            /* GFX8+ uses a DPP move; older chips use ds_swizzle_b32 with
             * bit 15 set in the swizzle pattern. */
            if (ctx->program->chip_class >= GFX8)
               emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst);
            else
               emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), dst);
         } else if (instr->dest.ssa.bit_size == 64) {
            /* 64-bit: same as above, applied to each dword. */
            Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
            bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
            if (ctx->program->chip_class >= GFX8) {
               lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
               hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
            } else {
               lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl));
               hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl));
            }
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
            emit_split_vector(ctx, dst, 2);
         } else {
            fprintf(stderr, "Unimplemented NIR instr bit size: ");
            nir_print_instr(&instr->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
      break;
   }
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
   case nir_intrinsic_quad_swizzle_amd: {
      /* Permute values inside each quad. The three swap intrinsics use fixed
       * permutations; quad_swizzle_amd takes its pattern from the intrinsic's
       * swizzle mask. */
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      if (!ctx->divergent_vals[instr->dest.ssa.index]) {
         emit_uniform_subgroup(ctx, instr, src);
         break;
      }
      uint16_t dpp_ctrl = 0;
      switch (instr->intrinsic) {
      case nir_intrinsic_quad_swap_horizontal:
         dpp_ctrl = dpp_quad_perm(1, 0, 3, 2);
         break;
      case nir_intrinsic_quad_swap_vertical:
         dpp_ctrl = dpp_quad_perm(2, 3, 0, 1);
         break;
      case nir_intrinsic_quad_swap_diagonal:
         dpp_ctrl = dpp_quad_perm(3, 2, 1, 0);
         break;
      case nir_intrinsic_quad_swizzle_amd:
         dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
         break;
      default:
         break;
      }
      /* Before GFX8 the permutation is done with ds_swizzle_b32, whose
       * pattern additionally gets bit 15 set. */
      if (ctx->program->chip_class < GFX8)
         dpp_ctrl |= (1 << 15);

      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      if (instr->dest.ssa.bit_size == 1) {
         /* Boolean: expand the lane mask to 0 / -1 per lane, permute that
          * value, and compare against 0 to get a lane mask again. */
         assert(src.regClass() == bld.lm);
         src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
         if (ctx->program->chip_class >= GFX8)
            src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
         else
            src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
         Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
         emit_wqm(ctx, tmp, dst);
      } else if (instr->dest.ssa.bit_size == 32) {
         Temp tmp;
         if (ctx->program->chip_class >= GFX8)
            tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
         else
            tmp = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
         emit_wqm(ctx, tmp, dst);
      } else if (instr->dest.ssa.bit_size == 64) {
         /* 64-bit: permute each dword separately and recombine. */
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         if (ctx->program->chip_class >= GFX8) {
            lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
            hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
         } else {
            lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, dpp_ctrl));
            hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, dpp_ctrl));
         }
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
         emit_split_vector(ctx, dst, 2);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_intrinsic_masked_swizzle_amd: {
      /* Swizzle with the intrinsic's swizzle mask passed unchanged as the
       * ds_swizzle_b32 pattern. */
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      if (!ctx->divergent_vals[instr->dest.ssa.index]) {
         emit_uniform_subgroup(ctx, instr, src);
         break;
      }
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      uint32_t mask = nir_intrinsic_swizzle_mask(instr);
      if (dst.regClass() == v1) {
         emit_wqm(ctx,
                  bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
                  dst);
      } else if (dst.regClass() == v2) {
         /* 64-bit: swizzle each dword separately and recombine. */
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
         hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
         emit_split_vector(ctx, dst, 2);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
6110    case nir_intrinsic_write_invocation_amd: {
6111       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6112       Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
6113       Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
6114       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6115       if (dst.regClass() == v1) {
6116          /* src2 is ignored for writelane. RA assigns the same reg for dst */
6117          emit_wqm(ctx, bld.writelane(bld.def(v1), val, lane, src), dst);
6118       } else if (dst.regClass() == v2) {
6119          Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
6120          Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
6121          bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
6122          bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
6123          Temp lo = emit_wqm(ctx, bld.writelane(bld.def(v1), val_lo, lane, src_hi));
6124          Temp hi = emit_wqm(ctx, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
6125          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
6126          emit_split_vector(ctx, dst, 2);
6127       } else {
6128          fprintf(stderr, "Unimplemented NIR instr bit size: ");
6129          nir_print_instr(&instr->instr, stderr);
6130          fprintf(stderr, "\n");
6131       }
6132       break;
6133    }
6134    case nir_intrinsic_mbcnt_amd: {
6135       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
6136       RegClass rc = RegClass(src.type(), 1);
6137       Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
6138       bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
6139       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6140       Temp wqm_tmp = emit_mbcnt(ctx, bld.def(v1), Operand(mask_lo), Operand(mask_hi));
6141       emit_wqm(ctx, wqm_tmp, dst);
6142       break;
6143    }
6144    case nir_intrinsic_load_helper_invocation: {
6145       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6146       bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
6147       ctx->block->kind |= block_kind_needs_lowering;
6148       ctx->program->needs_exact = true;
6149       break;
6150    }
6151    case nir_intrinsic_is_helper_invocation: {
6152       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6153       bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
6154       ctx->block->kind |= block_kind_needs_lowering;
6155       ctx->program->needs_exact = true;
6156       break;
6157    }
6158    case nir_intrinsic_demote:
6159       bld.pseudo(aco_opcode::p_demote_to_helper);
6160       ctx->block->kind |= block_kind_uses_demote;
6161       ctx->program->needs_exact = true;
6162       break;
6163    case nir_intrinsic_demote_if: {
6164       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
6165       assert(src.regClass() == bld.lm);
6166       Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
6167       bld.pseudo(aco_opcode::p_demote_to_helper, cond);
6168       ctx->block->kind |= block_kind_uses_demote;
6169       ctx->program->needs_exact = true;
6170       break;
6171    }
6172    case nir_intrinsic_first_invocation: {
6173       emit_wqm(ctx, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
6174                get_ssa_temp(ctx, &instr->dest.ssa));
6175       break;
6176    }
6177    case nir_intrinsic_shader_clock:
6178       bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
6179       emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
6180       break;
6181    case nir_intrinsic_load_vertex_id_zero_base: {
6182       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6183       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
6184       break;
6185    }
6186    case nir_intrinsic_load_first_vertex: {
6187       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6188       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
6189       break;
6190    }
6191    case nir_intrinsic_load_base_instance: {
6192       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6193       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
6194       break;
6195    }
6196    case nir_intrinsic_load_instance_id: {
6197       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6198       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
6199       break;
6200    }
6201    case nir_intrinsic_load_draw_id: {
6202       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6203       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
6204       break;
6205    }
6206    default:
6207       fprintf(stderr, "Unimplemented intrinsic instr: ");
6208       nir_print_instr(&instr->instr, stderr);
6209       fprintf(stderr, "\n");
6210       abort();
6211
6212       break;
6213    }
6214 }
6215
6216
6217 void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr,
6218                     Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr,
6219                     enum glsl_base_type *stype)
6220 {
6221    nir_deref_instr *texture_deref_instr = NULL;
6222    nir_deref_instr *sampler_deref_instr = NULL;
6223    int plane = -1;
6224
6225    for (unsigned i = 0; i < instr->num_srcs; i++) {
6226       switch (instr->src[i].src_type) {
6227       case nir_tex_src_texture_deref:
6228          texture_deref_instr = nir_src_as_deref(instr->src[i].src);
6229          break;
6230       case nir_tex_src_sampler_deref:
6231          sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
6232          break;
6233       case nir_tex_src_plane:
6234          plane = nir_src_as_int(instr->src[i].src);
6235          break;
6236       default:
6237          break;
6238       }
6239    }
6240
6241    *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
6242
6243    if (!sampler_deref_instr)
6244       sampler_deref_instr = texture_deref_instr;
6245
6246    if (plane >= 0) {
6247       assert(instr->op != nir_texop_txf_ms &&
6248              instr->op != nir_texop_samples_identical);
6249       assert(instr->sampler_dim  != GLSL_SAMPLER_DIM_BUF);
6250       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
6251    } else if (instr->sampler_dim  == GLSL_SAMPLER_DIM_BUF) {
6252       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
6253    } else {
6254       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
6255    }
6256    if (samp_ptr) {
6257       *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false);
6258
6259       if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
6260          /* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */
6261          Builder bld(ctx->program, ctx->block);
6262
6263          /* to avoid unnecessary moves, we split and recombine sampler and image */
6264          Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1),
6265                         bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
6266          Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
6267          bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]),
6268                     Definition(img[2]), Definition(img[3]), Definition(img[4]),
6269                     Definition(img[5]), Definition(img[6]), Definition(img[7]), *res_ptr);
6270          bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]),
6271                     Definition(samp[2]), Definition(samp[3]), *samp_ptr);
6272
6273          samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]);
6274          *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
6275                                img[0], img[1], img[2], img[3],
6276                                img[4], img[5], img[6], img[7]);
6277          *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
6278                                 samp[0], samp[1], samp[2], samp[3]);
6279       }
6280    }
6281    if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
6282                      instr->op == nir_texop_samples_identical))
6283       *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
6284 }
6285
6286 void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
6287                        Temp *out_ma, Temp *out_sc, Temp *out_tc)
6288 {
6289    Builder bld(ctx->program, ctx->block);
6290
6291    Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
6292    Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
6293    Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
6294
6295    Operand neg_one(0xbf800000u);
6296    Operand one(0x3f800000u);
6297    Operand two(0x40000000u);
6298    Operand four(0x40800000u);
6299
6300    Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma);
6301    Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
6302    Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
6303
6304    Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
6305    Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id);
6306    is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
6307    Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y);
6308
6309    // select sc
6310    Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
6311    Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1),
6312                        bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z),
6313                        one, is_ma_y);
6314    *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
6315
6316    // select tc
6317    tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
6318    sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
6319    *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
6320
6321    // select ma
6322    tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6323                   bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
6324                   deriv_z, is_ma_z);
6325    tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp);
6326    *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
6327 }
6328
6329 void prepare_cube_coords(isel_context *ctx, Temp* coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
6330 {
6331    Builder bld(ctx->program, ctx->block);
6332    Temp coord_args[4], ma, tc, sc, id;
6333    for (unsigned i = 0; i < (is_array ? 4 : 3); i++)
6334       coord_args[i] = emit_extract_vector(ctx, *coords, i, v1);
6335
6336    if (is_array) {
6337       coord_args[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_args[3]);
6338
6339       // see comment in ac_prepare_cube_coords()
6340       if (ctx->options->chip_class <= GFX8)
6341          coord_args[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coord_args[3]);
6342    }
6343
6344    ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6345
6346    aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
6347    vop3a->operands[0] = Operand(ma);
6348    vop3a->abs[0] = true;
6349    Temp invma = bld.tmp(v1);
6350    vop3a->definitions[0] = Definition(invma);
6351    ctx->block->instructions.emplace_back(std::move(vop3a));
6352
6353    sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6354    if (!is_deriv)
6355       sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
6356
6357    tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6358    if (!is_deriv)
6359       tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
6360
6361    id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6362
6363    if (is_deriv) {
6364       sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
6365       tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
6366
6367       for (unsigned i = 0; i < 2; i++) {
6368          // see comment in ac_prepare_cube_coords()
6369          Temp deriv_ma;
6370          Temp deriv_sc, deriv_tc;
6371          build_cube_select(ctx, ma, id, i ? *ddy : *ddx,
6372                            &deriv_ma, &deriv_sc, &deriv_tc);
6373
6374          deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
6375
6376          Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6377                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
6378                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
6379          Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6380                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
6381                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
6382          *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
6383       }
6384
6385       sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc);
6386       tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc);
6387    }
6388
6389    if (is_array)
6390       id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coord_args[3], id, Operand(0x41000000u/*8.0*/));
6391    *coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), sc, tc, id);
6392
6393 }
6394
6395 Temp apply_round_slice(isel_context *ctx, Temp coords, unsigned idx)
6396 {
6397    Temp coord_vec[3];
6398    for (unsigned i = 0; i < coords.size(); i++)
6399       coord_vec[i] = emit_extract_vector(ctx, coords, i, v1);
6400
6401    Builder bld(ctx->program, ctx->block);
6402    coord_vec[idx] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_vec[idx]);
6403
6404    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6405    for (unsigned i = 0; i < coords.size(); i++)
6406       vec->operands[i] = Operand(coord_vec[i]);
6407    Temp res = bld.tmp(RegType::vgpr, coords.size());
6408    vec->definitions[0] = Definition(res);
6409    ctx->block->instructions.emplace_back(std::move(vec));
6410    return res;
6411 }
6412
6413 void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
6414 {
6415    if (vec->parent_instr->type != nir_instr_type_alu)
6416       return;
6417    nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr);
6418    if (vec_instr->op != nir_op_vec(vec->num_components))
6419       return;
6420
6421    for (unsigned i = 0; i < vec->num_components; i++) {
6422       cv[i] = vec_instr->src[i].swizzle[0] == 0 ?
6423               nir_src_as_const_value(vec_instr->src[i].src) : NULL;
6424    }
6425 }
6426
6427 void visit_tex(isel_context *ctx, nir_tex_instr *instr)
6428 {
6429    Builder bld(ctx->program, ctx->block);
6430    bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
6431         has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
6432    Temp resource, sampler, fmask_ptr, bias = Temp(), coords, compare = Temp(), sample_index = Temp(),
6433         lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), derivs = Temp();
6434    nir_const_value *sample_index_cv = NULL;
6435    nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
6436    enum glsl_base_type stype;
6437    tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
6438
6439    bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
6440                                   (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
6441    bool tg4_integer_cube_workaround = tg4_integer_workarounds &&
6442                                       instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
6443
6444    for (unsigned i = 0; i < instr->num_srcs; i++) {
6445       switch (instr->src[i].src_type) {
6446       case nir_tex_src_coord:
6447          coords = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[i].src.ssa));
6448          break;
6449       case nir_tex_src_bias:
6450          if (instr->op == nir_texop_txb) {
6451             bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
6452             has_bias = true;
6453          }
6454          break;
6455       case nir_tex_src_lod: {
6456          nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
6457
6458          if (val && val->f32 <= 0.0) {
6459             level_zero = true;
6460          } else {
6461             lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
6462             has_lod = true;
6463          }
6464          break;
6465       }
6466       case nir_tex_src_comparator:
6467          if (instr->is_shadow) {
6468             compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
6469             has_compare = true;
6470          }
6471          break;
6472       case nir_tex_src_offset:
6473          offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
6474          get_const_vec(instr->src[i].src.ssa, const_offset);
6475          has_offset = true;
6476          break;
6477       case nir_tex_src_ddx:
6478          ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
6479          has_ddx = true;
6480          break;
6481       case nir_tex_src_ddy:
6482          ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
6483          has_ddy = true;
6484          break;
6485       case nir_tex_src_ms_index:
6486          sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
6487          sample_index_cv = nir_src_as_const_value(instr->src[i].src);
6488          has_sample_index = true;
6489          break;
6490       case nir_tex_src_texture_offset:
6491       case nir_tex_src_sampler_offset:
6492       default:
6493          break;
6494       }
6495    }
6496 // TODO: all other cases: structure taken from ac_nir_to_llvm.c
6497    if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
6498       return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
6499
6500    if (instr->op == nir_texop_texture_samples) {
6501       Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
6502
6503       Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
6504       Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
6505       Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
6506       Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
6507
6508       bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
6509                samples, Operand(1u), bld.scc(is_msaa));
6510       return;
6511    }
6512
6513    if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
6514       aco_ptr<Instruction> tmp_instr;
6515       Temp acc, pack = Temp();
6516
6517       uint32_t pack_const = 0;
6518       for (unsigned i = 0; i < offset.size(); i++) {
6519          if (!const_offset[i])
6520             continue;
6521          pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
6522       }
6523
6524       if (offset.type() == RegType::sgpr) {
6525          for (unsigned i = 0; i < offset.size(); i++) {
6526             if (const_offset[i])
6527                continue;
6528
6529             acc = emit_extract_vector(ctx, offset, i, s1);
6530             acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
6531
6532             if (i) {
6533                acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
6534             }
6535
6536             if (pack == Temp()) {
6537                pack = acc;
6538             } else {
6539                pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
6540             }
6541          }
6542
6543          if (pack_const && pack != Temp())
6544             pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
6545       } else {
6546          for (unsigned i = 0; i < offset.size(); i++) {
6547             if (const_offset[i])
6548                continue;
6549
6550             acc = emit_extract_vector(ctx, offset, i, v1);
6551             acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
6552
6553             if (i) {
6554                acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
6555             }
6556
6557             if (pack == Temp()) {
6558                pack = acc;
6559             } else {
6560                pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
6561             }
6562          }
6563
6564          if (pack_const && pack != Temp())
6565             pack = bld.sop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
6566       }
6567       if (pack_const && pack == Temp())
6568          offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
6569       else if (pack == Temp())
6570          has_offset = false;
6571       else
6572          offset = pack;
6573    }
6574
6575    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
6576       prepare_cube_coords(ctx, &coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
6577
6578    /* pack derivatives */
6579    if (has_ddx || has_ddy) {
6580       if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
6581          derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4),
6582                              ddx, Operand(0u), ddy, Operand(0u));
6583       } else {
6584          derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, ddx.size() + ddy.size()), ddx, ddy);
6585       }
6586       has_derivs = true;
6587    }
6588
6589    if (instr->coord_components > 1 &&
6590        instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6591        instr->is_array &&
6592        instr->op != nir_texop_txf)
6593       coords = apply_round_slice(ctx, coords, 1);
6594
6595    if (instr->coord_components > 2 &&
6596       (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
6597        instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6598        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
6599        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6600        instr->is_array &&
6601        instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms)
6602       coords = apply_round_slice(ctx, coords, 2);
6603
6604    if (ctx->options->chip_class == GFX9 &&
6605        instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6606        instr->op != nir_texop_lod && instr->coord_components) {
6607       assert(coords.size() > 0 && coords.size() < 3);
6608
6609       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size() + 1, 1)};
6610       vec->operands[0] = Operand(emit_extract_vector(ctx, coords, 0, v1));
6611       vec->operands[1] = instr->op == nir_texop_txf ? Operand((uint32_t) 0) : Operand((uint32_t) 0x3f000000);
6612       if (coords.size() > 1)
6613          vec->operands[2] = Operand(emit_extract_vector(ctx, coords, 1, v1));
6614       coords = bld.tmp(RegType::vgpr, coords.size() + 1);
6615       vec->definitions[0] = Definition(coords);
6616       ctx->block->instructions.emplace_back(std::move(vec));
6617    }
6618
6619    bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
6620
6621    if (instr->op == nir_texop_samples_identical)
6622       resource = fmask_ptr;
6623
6624    else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6625              instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6626             instr->op != nir_texop_txs) {
6627       assert(has_sample_index);
6628       Operand op(sample_index);
6629       if (sample_index_cv)
6630          op = Operand(sample_index_cv->u32);
6631       sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
6632    }
6633
6634    if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
6635       Temp split_coords[coords.size()];
6636       emit_split_vector(ctx, coords, coords.size());
6637       for (unsigned i = 0; i < coords.size(); i++)
6638          split_coords[i] = emit_extract_vector(ctx, coords, i, v1);
6639
6640       unsigned i = 0;
6641       for (; i < std::min(offset.size(), instr->coord_components); i++) {
6642          Temp off = emit_extract_vector(ctx, offset, i, v1);
6643          split_coords[i] = bld.vadd32(bld.def(v1), split_coords[i], off);
6644       }
6645
6646       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6647       for (unsigned i = 0; i < coords.size(); i++)
6648          vec->operands[i] = Operand(split_coords[i]);
6649       coords = bld.tmp(coords.regClass());
6650       vec->definitions[0] = Definition(coords);
6651       ctx->block->instructions.emplace_back(std::move(vec));
6652
6653       has_offset = false;
6654    }
6655
6656    /* Build tex instruction */
6657    unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
6658    unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
6659                   ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
6660                   : 0;
6661    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6662    Temp tmp_dst = dst;
6663
6664    /* gather4 selects the component by dmask and always returns vec4 */
6665    if (instr->op == nir_texop_tg4) {
6666       assert(instr->dest.ssa.num_components == 4);
6667       if (instr->is_shadow)
6668          dmask = 1;
6669       else
6670          dmask = 1 << instr->component;
6671       if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
6672          tmp_dst = bld.tmp(v4);
6673    } else if (instr->op == nir_texop_samples_identical) {
6674       tmp_dst = bld.tmp(v1);
6675    } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) {
6676       tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
6677    }
6678
6679    aco_ptr<MIMG_instruction> tex;
6680    if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
6681       if (!has_lod)
6682          lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6683
6684       bool div_by_6 = instr->op == nir_texop_txs &&
6685                       instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
6686                       instr->is_array &&
6687                       (dmask & (1 << 2));
6688       if (tmp_dst.id() == dst.id() && div_by_6)
6689          tmp_dst = bld.tmp(tmp_dst.regClass());
6690
6691       tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6692       tex->operands[0] = Operand(as_vgpr(ctx,lod));
6693       tex->operands[1] = Operand(resource);
6694       if (ctx->options->chip_class == GFX9 &&
6695           instr->op == nir_texop_txs &&
6696           instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6697           instr->is_array) {
6698          tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
6699       } else if (instr->op == nir_texop_query_levels) {
6700          tex->dmask = 1 << 3;
6701       } else {
6702          tex->dmask = dmask;
6703       }
6704       tex->da = da;
6705       tex->definitions[0] = Definition(tmp_dst);
6706       tex->dim = dim;
6707       tex->can_reorder = true;
6708       ctx->block->instructions.emplace_back(std::move(tex));
6709
6710       if (div_by_6) {
6711          /* divide 3rd value by 6 by multiplying with magic number */
6712          emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6713          Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
6714          Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c);
6715          assert(instr->dest.ssa.num_components == 3);
6716          Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
6717          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6718                               emit_extract_vector(ctx, tmp_dst, 0, v1),
6719                               emit_extract_vector(ctx, tmp_dst, 1, v1),
6720                               by_6);
6721
6722       }
6723
6724       expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6725       return;
6726    }
6727
6728    Temp tg4_compare_cube_wa64 = Temp();
6729
6730    if (tg4_integer_workarounds) {
6731       tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6732       tex->operands[0] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6733       tex->operands[1] = Operand(resource);
6734       tex->dim = dim;
6735       tex->dmask = 0x3;
6736       tex->da = da;
6737       Temp size = bld.tmp(v2);
6738       tex->definitions[0] = Definition(size);
6739       tex->can_reorder = true;
6740       ctx->block->instructions.emplace_back(std::move(tex));
6741       emit_split_vector(ctx, size, size.size());
6742
6743       Temp half_texel[2];
6744       for (unsigned i = 0; i < 2; i++) {
6745          half_texel[i] = emit_extract_vector(ctx, size, i, v1);
6746          half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
6747          half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
6748          half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
6749       }
6750
6751       Temp orig_coords[2] = {
6752          emit_extract_vector(ctx, coords, 0, v1),
6753          emit_extract_vector(ctx, coords, 1, v1)};
6754       Temp new_coords[2] = {
6755          bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[0], half_texel[0]),
6756          bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[1], half_texel[1])
6757       };
6758
6759       if (tg4_integer_cube_workaround) {
6760          // see comment in ac_nir_to_llvm.c's lower_gather4_integer()
6761          Temp desc[resource.size()];
6762          aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector,
6763                                                                            Format::PSEUDO, 1, resource.size())};
6764          split->operands[0] = Operand(resource);
6765          for (unsigned i = 0; i < resource.size(); i++) {
6766             desc[i] = bld.tmp(s1);
6767             split->definitions[i] = Definition(desc[i]);
6768          }
6769          ctx->block->instructions.emplace_back(std::move(split));
6770
6771          Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16)));
6772          Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
6773                                          Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8));
6774
6775          Temp nfmt;
6776          if (stype == GLSL_TYPE_UINT) {
6777             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6778                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED),
6779                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT),
6780                             bld.scc(compare_cube_wa));
6781          } else {
6782             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6783                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED),
6784                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
6785                             bld.scc(compare_cube_wa));
6786          }
6787          tg4_compare_cube_wa64 = bld.tmp(bld.lm);
6788          bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
6789
6790          nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
6791
6792          desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
6793                             Operand((uint32_t)C_008F14_NUM_FORMAT));
6794          desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
6795
6796          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
6797                                                                          Format::PSEUDO, resource.size(), 1)};
6798          for (unsigned i = 0; i < resource.size(); i++)
6799             vec->operands[i] = Operand(desc[i]);
6800          resource = bld.tmp(resource.regClass());
6801          vec->definitions[0] = Definition(resource);
6802          ctx->block->instructions.emplace_back(std::move(vec));
6803
6804          new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6805                                   new_coords[0], orig_coords[0], tg4_compare_cube_wa64);
6806          new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6807                                   new_coords[1], orig_coords[1], tg4_compare_cube_wa64);
6808       }
6809
6810       if (coords.size() == 3) {
6811          coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3),
6812                              new_coords[0], new_coords[1],
6813                              emit_extract_vector(ctx, coords, 2, v1));
6814       } else {
6815          assert(coords.size() == 2);
6816          coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
6817                              new_coords[0], new_coords[1]);
6818       }
6819    }
6820
6821    std::vector<Operand> args;
6822    if (has_offset)
6823       args.emplace_back(Operand(offset));
6824    if (has_bias)
6825       args.emplace_back(Operand(bias));
6826    if (has_compare)
6827       args.emplace_back(Operand(compare));
6828    if (has_derivs)
6829       args.emplace_back(Operand(derivs));
6830    args.emplace_back(Operand(coords));
6831    if (has_sample_index)
6832       args.emplace_back(Operand(sample_index));
6833    if (has_lod)
6834       args.emplace_back(lod);
6835
6836    Temp arg;
6837    if (args.size() > 1) {
6838       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
6839       unsigned size = 0;
6840       for (unsigned i = 0; i < args.size(); i++) {
6841          size += args[i].size();
6842          vec->operands[i] = args[i];
6843       }
6844       RegClass rc = RegClass(RegType::vgpr, size);
6845       Temp tmp = bld.tmp(rc);
6846       vec->definitions[0] = Definition(tmp);
6847       ctx->block->instructions.emplace_back(std::move(vec));
6848       arg = tmp;
6849    } else {
6850       assert(args[0].isTemp());
6851       arg = as_vgpr(ctx, args[0].getTemp());
6852    }
6853
6854    /* we don't need the bias, sample index, compare value or offset to be
6855     * computed in WQM but if the p_create_vector copies the coordinates, then it
6856     * needs to be in WQM */
6857    if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
6858        instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
6859        instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
6860       arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true);
6861
6862    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
6863       //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
6864
6865       assert(coords.size() == 1);
6866       unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
6867       aco_opcode op;
6868       switch (last_bit) {
6869       case 1:
6870          op = aco_opcode::buffer_load_format_x; break;
6871       case 2:
6872          op = aco_opcode::buffer_load_format_xy; break;
6873       case 3:
6874          op = aco_opcode::buffer_load_format_xyz; break;
6875       case 4:
6876          op = aco_opcode::buffer_load_format_xyzw; break;
6877       default:
6878          unreachable("Tex instruction loads more than 4 components.");
6879       }
6880
6881       /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */
6882       if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
6883          tmp_dst = dst;
6884       else
6885          tmp_dst = bld.tmp(RegType::vgpr, last_bit);
6886
6887       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
6888       mubuf->operands[0] = Operand(coords);
6889       mubuf->operands[1] = Operand(resource);
6890       mubuf->operands[2] = Operand((uint32_t) 0);
6891       mubuf->definitions[0] = Definition(tmp_dst);
6892       mubuf->idxen = true;
6893       mubuf->can_reorder = true;
6894       ctx->block->instructions.emplace_back(std::move(mubuf));
6895
6896       expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
6897       return;
6898    }
6899
6900
6901    if (instr->op == nir_texop_txf ||
6902        instr->op == nir_texop_txf_ms ||
6903        instr->op == nir_texop_samples_identical) {
6904       aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
6905       tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 2, 1));
6906       tex->operands[0] = Operand(arg);
6907       tex->operands[1] = Operand(resource);
6908       tex->dim = dim;
6909       tex->dmask = dmask;
6910       tex->unrm = true;
6911       tex->da = da;
6912       tex->definitions[0] = Definition(tmp_dst);
6913       tex->can_reorder = true;
6914       ctx->block->instructions.emplace_back(std::move(tex));
6915
6916       if (instr->op == nir_texop_samples_identical) {
6917          assert(dmask == 1 && dst.regClass() == v1);
6918          assert(dst.id() != tmp_dst.id());
6919
6920          Temp tmp = bld.tmp(bld.lm);
6921          bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
6922          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
6923
6924       } else {
6925          expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6926       }
6927       return;
6928    }
6929
6930    // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
6931    aco_opcode opcode = aco_opcode::image_sample;
6932    if (has_offset) { /* image_sample_*_o */
6933       if (has_compare) {
6934          opcode = aco_opcode::image_sample_c_o;
6935          if (has_derivs)
6936             opcode = aco_opcode::image_sample_c_d_o;
6937          if (has_bias)
6938             opcode = aco_opcode::image_sample_c_b_o;
6939          if (level_zero)
6940             opcode = aco_opcode::image_sample_c_lz_o;
6941          if (has_lod)
6942             opcode = aco_opcode::image_sample_c_l_o;
6943       } else {
6944          opcode = aco_opcode::image_sample_o;
6945          if (has_derivs)
6946             opcode = aco_opcode::image_sample_d_o;
6947          if (has_bias)
6948             opcode = aco_opcode::image_sample_b_o;
6949          if (level_zero)
6950             opcode = aco_opcode::image_sample_lz_o;
6951          if (has_lod)
6952             opcode = aco_opcode::image_sample_l_o;
6953       }
6954    } else { /* no offset */
6955       if (has_compare) {
6956          opcode = aco_opcode::image_sample_c;
6957          if (has_derivs)
6958             opcode = aco_opcode::image_sample_c_d;
6959          if (has_bias)
6960             opcode = aco_opcode::image_sample_c_b;
6961          if (level_zero)
6962             opcode = aco_opcode::image_sample_c_lz;
6963          if (has_lod)
6964             opcode = aco_opcode::image_sample_c_l;
6965       } else {
6966          opcode = aco_opcode::image_sample;
6967          if (has_derivs)
6968             opcode = aco_opcode::image_sample_d;
6969          if (has_bias)
6970             opcode = aco_opcode::image_sample_b;
6971          if (level_zero)
6972             opcode = aco_opcode::image_sample_lz;
6973          if (has_lod)
6974             opcode = aco_opcode::image_sample_l;
6975       }
6976    }
6977
6978    if (instr->op == nir_texop_tg4) {
6979       if (has_offset) {
6980          opcode = aco_opcode::image_gather4_lz_o;
6981          if (has_compare)
6982             opcode = aco_opcode::image_gather4_c_lz_o;
6983       } else {
6984          opcode = aco_opcode::image_gather4_lz;
6985          if (has_compare)
6986             opcode = aco_opcode::image_gather4_c_lz;
6987       }
6988    } else if (instr->op == nir_texop_lod) {
6989       opcode = aco_opcode::image_get_lod;
6990    }
6991
6992    tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
6993    tex->operands[0] = Operand(arg);
6994    tex->operands[1] = Operand(resource);
6995    tex->operands[2] = Operand(sampler);
6996    tex->dim = dim;
6997    tex->dmask = dmask;
6998    tex->da = da;
6999    tex->definitions[0] = Definition(tmp_dst);
7000    tex->can_reorder = true;
7001    ctx->block->instructions.emplace_back(std::move(tex));
7002
7003    if (tg4_integer_cube_workaround) {
7004       assert(tmp_dst.id() != dst.id());
7005       assert(tmp_dst.size() == dst.size() && dst.size() == 4);
7006
7007       emit_split_vector(ctx, tmp_dst, tmp_dst.size());
7008       Temp val[4];
7009       for (unsigned i = 0; i < dst.size(); i++) {
7010          val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
7011          Temp cvt_val;
7012          if (stype == GLSL_TYPE_UINT)
7013             cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
7014          else
7015             cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
7016          val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64);
7017       }
7018       Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4);
7019       tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
7020                            val[0], val[1], val[2], val[3]);
7021    }
7022    unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask;
7023    expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
7024
7025 }
7026
7027
7028 Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa)
7029 {
7030    Temp tmp = get_ssa_temp(ctx, ssa);
7031    if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
7032       return Operand(tmp.regClass());
7033    else
7034       return Operand(tmp);
7035 }
7036
7037 void visit_phi(isel_context *ctx, nir_phi_instr *instr)
7038 {
7039    aco_ptr<Pseudo_instruction> phi;
7040    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7041    assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
7042
7043    bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index];
7044    logical |= ctx->block->kind & block_kind_merge;
7045    aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
7046
7047    /* we want a sorted list of sources, since the predecessor list is also sorted */
7048    std::map<unsigned, nir_ssa_def*> phi_src;
7049    nir_foreach_phi_src(src, instr)
7050       phi_src[src->pred->index] = src->src.ssa;
7051
7052    std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
7053    unsigned num_operands = 0;
7054    Operand operands[std::max(exec_list_length(&instr->srcs), (unsigned)preds.size())];
7055    unsigned num_defined = 0;
7056    unsigned cur_pred_idx = 0;
7057    for (std::pair<unsigned, nir_ssa_def *> src : phi_src) {
7058       if (cur_pred_idx < preds.size()) {
7059          /* handle missing preds (IF merges with discard/break) and extra preds (loop exit with discard) */
7060          unsigned block = ctx->cf_info.nir_to_aco[src.first];
7061          unsigned skipped = 0;
7062          while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
7063             skipped++;
7064          if (cur_pred_idx + skipped < preds.size()) {
7065             for (unsigned i = 0; i < skipped; i++)
7066                operands[num_operands++] = Operand(dst.regClass());
7067             cur_pred_idx += skipped;
7068          } else {
7069             continue;
7070          }
7071       }
7072       cur_pred_idx++;
7073       Operand op = get_phi_operand(ctx, src.second);
7074       operands[num_operands++] = op;
7075       num_defined += !op.isUndefined();
7076    }
7077    /* handle block_kind_continue_or_break at loop exit blocks */
7078    while (cur_pred_idx++ < preds.size())
7079       operands[num_operands++] = Operand(dst.regClass());
7080
7081    if (num_defined == 0) {
7082       Builder bld(ctx->program, ctx->block);
7083       if (dst.regClass() == s1) {
7084          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
7085       } else if (dst.regClass() == v1) {
7086          bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
7087       } else {
7088          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
7089          for (unsigned i = 0; i < dst.size(); i++)
7090             vec->operands[i] = Operand(0u);
7091          vec->definitions[0] = Definition(dst);
7092          ctx->block->instructions.emplace_back(std::move(vec));
7093       }
7094       return;
7095    }
7096
7097    /* we can use a linear phi in some cases if one src is undef */
7098    if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
7099       phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, num_operands, 1));
7100
7101       Block *linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
7102       Block *invert = &ctx->program->blocks[linear_else->linear_preds[0]];
7103       assert(invert->kind & block_kind_invert);
7104
7105       unsigned then_block = invert->linear_preds[0];
7106
7107       Block* insert_block = NULL;
7108       for (unsigned i = 0; i < num_operands; i++) {
7109          Operand op = operands[i];
7110          if (op.isUndefined())
7111             continue;
7112          insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
7113          phi->operands[0] = op;
7114          break;
7115       }
7116       assert(insert_block); /* should be handled by the "num_defined == 0" case above */
7117       phi->operands[1] = Operand(dst.regClass());
7118       phi->definitions[0] = Definition(dst);
7119       insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
7120       return;
7121    }
7122
7123    /* try to scalarize vector phis */
7124    if (instr->dest.ssa.bit_size != 1 && dst.size() > 1) {
7125       // TODO: scalarize linear phis on divergent ifs
7126       bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
7127       std::array<Temp, 4> new_vec;
7128       for (unsigned i = 0; can_scalarize && (i < num_operands); i++) {
7129          Operand src = operands[i];
7130          if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end())
7131             can_scalarize = false;
7132       }
7133       if (can_scalarize) {
7134          unsigned num_components = instr->dest.ssa.num_components;
7135          assert(dst.size() % num_components == 0);
7136          RegClass rc = RegClass(dst.type(), dst.size() / num_components);
7137
7138          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
7139          for (unsigned k = 0; k < num_components; k++) {
7140             phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
7141             for (unsigned i = 0; i < num_operands; i++) {
7142                Operand src = operands[i];
7143                phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc);
7144             }
7145             Temp phi_dst = {ctx->program->allocateId(), rc};
7146             phi->definitions[0] = Definition(phi_dst);
7147             ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
7148             new_vec[k] = phi_dst;
7149             vec->operands[k] = Operand(phi_dst);
7150          }
7151          vec->definitions[0] = Definition(dst);
7152          ctx->block->instructions.emplace_back(std::move(vec));
7153          ctx->allocated_vec.emplace(dst.id(), new_vec);
7154          return;
7155       }
7156    }
7157
7158    phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
7159    for (unsigned i = 0; i < num_operands; i++)
7160       phi->operands[i] = operands[i];
7161    phi->definitions[0] = Definition(dst);
7162    ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
7163 }
7164
7165
7166 void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr)
7167 {
7168    Temp dst = get_ssa_temp(ctx, &instr->def);
7169
7170    assert(dst.type() == RegType::sgpr);
7171
7172    if (dst.size() == 1) {
7173       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u));
7174    } else {
7175       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
7176       for (unsigned i = 0; i < dst.size(); i++)
7177          vec->operands[i] = Operand(0u);
7178       vec->definitions[0] = Definition(dst);
7179       ctx->block->instructions.emplace_back(std::move(vec));
7180    }
7181 }
7182
/* Select a NIR jump instruction (break or continue).
 *
 * The logical part of the current block is closed and a logical edge to the
 * jump target is added (the parent loop's exit block for a break, the loop
 * header for a continue). If the cf_info checks below classify the jump as
 * uniform, it becomes a plain p_branch with a direct linear edge to the
 * target. Otherwise the current block branches to two newly created blocks:
 * one that only forwards to the jump target, and one that becomes the new
 * ctx->block for the instructions that follow. Any other jump type is
 * printed to stderr and the process aborts. */
void visit_jump(isel_context *ctx, nir_jump_instr *instr)
{
   Builder bld(ctx->program, ctx->block);
   Block *logical_target;
   append_logical_end(ctx->block);
   /* remember the index: blocks are referred to by index once new blocks get created below */
   unsigned idx = ctx->block->index;

   switch (instr->type) {
   case nir_jump_break:
      logical_target = ctx->cf_info.parent_loop.exit;
      add_logical_edge(idx, logical_target);
      ctx->block->kind |= block_kind_break;

      if (!ctx->cf_info.parent_if.is_divergent &&
          !ctx->cf_info.parent_loop.has_divergent_continue) {
         /* uniform break - directly jump out of the loop */
         ctx->block->kind |= block_kind_uniform;
         ctx->cf_info.has_branch = true;
         bld.branch(aco_opcode::p_branch);
         add_linear_edge(idx, logical_target);
         return;
      }
      /* divergent break: record it on the parent loop and map the NIR block to this ACO block */
      ctx->cf_info.parent_loop.has_divergent_branch = true;
      ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
      break;
   case nir_jump_continue:
      logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
      add_logical_edge(idx, logical_target);
      ctx->block->kind |= block_kind_continue;

      if (ctx->cf_info.parent_if.is_divergent) {
         /* for potential uniform breaks after this continue,
            we must ensure that they are handled correctly */
         ctx->cf_info.parent_loop.has_divergent_continue = true;
         ctx->cf_info.parent_loop.has_divergent_branch = true;
         ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
      } else {
         /* uniform continue - directly jump to the loop header */
         ctx->block->kind |= block_kind_uniform;
         ctx->cf_info.has_branch = true;
         bld.branch(aco_opcode::p_branch);
         add_linear_edge(idx, logical_target);
         return;
      }
      break;
   default:
      fprintf(stderr, "Unknown NIR jump instr: ");
      nir_print_instr(&instr->instr, stderr);
      fprintf(stderr, "\n");
      abort();
   }

   /* only divergent jumps reach this point (the uniform cases returned above) */

   /* remove critical edges from linear CFG */
   bld.branch(aco_opcode::p_branch);
   /* helper block that only forwards to the jump target */
   Block* break_block = ctx->program->create_and_insert_block();
   break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
   break_block->kind |= block_kind_uniform;
   add_linear_edge(idx, break_block);
   /* the loop_header pointer might be invalidated by this point */
   if (instr->type == nir_jump_continue)
      logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
   add_linear_edge(break_block->index, logical_target);
   bld.reset(break_block);
   bld.branch(aco_opcode::p_branch);

   /* block in which instruction selection continues after the jump */
   Block* continue_block = ctx->program->create_and_insert_block();
   continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
   add_linear_edge(idx, continue_block);
   append_logical_start(continue_block);
   ctx->block = continue_block;
   return;
}
7255
7256 void visit_block(isel_context *ctx, nir_block *block)
7257 {
7258    nir_foreach_instr(instr, block) {
7259       switch (instr->type) {
7260       case nir_instr_type_alu:
7261          visit_alu_instr(ctx, nir_instr_as_alu(instr));
7262          break;
7263       case nir_instr_type_load_const:
7264          visit_load_const(ctx, nir_instr_as_load_const(instr));
7265          break;
7266       case nir_instr_type_intrinsic:
7267          visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
7268          break;
7269       case nir_instr_type_tex:
7270          visit_tex(ctx, nir_instr_as_tex(instr));
7271          break;
7272       case nir_instr_type_phi:
7273          visit_phi(ctx, nir_instr_as_phi(instr));
7274          break;
7275       case nir_instr_type_ssa_undef:
7276          visit_undef(ctx, nir_instr_as_ssa_undef(instr));
7277          break;
7278       case nir_instr_type_deref:
7279          break;
7280       case nir_instr_type_jump:
7281          visit_jump(ctx, nir_instr_as_jump(instr));
7282          break;
7283       default:
7284          fprintf(stderr, "Unknown NIR instr type: ");
7285          nir_print_instr(instr, stderr);
7286          fprintf(stderr, "\n");
7287          //abort();
7288       }
7289    }
7290
7291    if (!ctx->cf_info.parent_loop.has_divergent_branch)
7292       ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
7293 }
7294
7295
7296
/* Select a NIR loop.
 *
 * The current block becomes the loop preheader and branches into a newly
 * created loop header block. The loop body is then selected via
 * visit_cf_list(). Unless the body ended in a branch, the last body block
 * gets an edge back to the loop header (and, when exec may be empty, an
 * additional path to the loop exit). Finally the phis of the loop header are
 * fixed up and the loop exit block is inserted and made the current block. */
static void visit_loop(isel_context *ctx, nir_loop *loop)
{
   /* close the preheader */
   append_logical_end(ctx->block);
   ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
   Builder bld(ctx->program, ctx->block);
   bld.branch(aco_opcode::p_branch);
   unsigned loop_preheader_idx = ctx->block->index;

   /* the exit block is kept as a local here and only inserted into the
    * program after the loop body has been emitted (see insert_block below) */
   Block loop_exit = Block();
   loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth;
   loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));

   Block* loop_header = ctx->program->create_and_insert_block();
   loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
   loop_header->kind |= block_kind_loop_header;
   add_edge(loop_preheader_idx, loop_header);
   ctx->block = loop_header;

   /* emit loop body */
   unsigned loop_header_idx = loop_header->index;
   /* NOTE(review): presumably installs header index and exit block as the
    * current parent loop in ctx->cf_info and restores the previous values
    * when it goes out of scope - confirm against loop_info_RAII */
   loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
   append_logical_start(ctx->block);
   visit_cf_list(ctx, &loop->body);

   //TODO: what if a loop ends with an unconditional or uniformly branched continue and this branch is never taken?
   if (!ctx->cf_info.has_branch) {
      /* the body did not end in a branch: add the back edge to the loop header */
      append_logical_end(ctx->block);
      if (ctx->cf_info.exec_potentially_empty) {
         /* Discards can result in code running with an empty exec mask.
          * This would result in divergent breaks not ever being taken. As a
          * workaround, break the loop when the loop mask is empty instead of
          * always continuing. */
         ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
         unsigned block_idx = ctx->block->index;

         /* create helper blocks to avoid critical edges */
         Block *break_block = ctx->program->create_and_insert_block();
         break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
         break_block->kind = block_kind_uniform;
         bld.reset(break_block);
         bld.branch(aco_opcode::p_branch);
         add_linear_edge(block_idx, break_block);
         add_linear_edge(break_block->index, &loop_exit);

         Block *continue_block = ctx->program->create_and_insert_block();
         continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
         continue_block->kind = block_kind_uniform;
         bld.reset(continue_block);
         bld.branch(aco_opcode::p_branch);
         add_linear_edge(block_idx, continue_block);
         add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);

         add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
         /* look the block up by index again now that new blocks were created */
         ctx->block = &ctx->program->blocks[block_idx];
      } else {
         ctx->block->kind |= (block_kind_continue | block_kind_uniform);
         /* with a divergent branch in the loop only the linear back edge is added */
         if (!ctx->cf_info.parent_loop.has_divergent_branch)
            add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
         else
            add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
      }

      /* terminate the last body block */
      bld.reset(ctx->block);
      bld.branch(aco_opcode::p_branch);
   }

   /* fixup phis in loop header from unreachable blocks */
   if (ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch) {
      bool linear = ctx->cf_info.has_branch;
      /* NOTE(review): repeats the enclosing condition, so this is always true here */
      bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
      for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
         if ((logical && instr->opcode == aco_opcode::p_phi) ||
             (linear && instr->opcode == aco_opcode::p_linear_phi)) {
            /* the last operand should be the one that needs to be removed */
            instr->operands.pop_back();
         } else if (!is_phi(instr)) {
            /* stop at the first instruction that is not a phi */
            break;
         }
      }
   }

   ctx->cf_info.has_branch = false;

   // TODO: if the loop has not a single exit, we must add one °°
   /* emit loop successor block */
   ctx->block = ctx->program->insert_block(std::move(loop_exit));
   append_logical_start(ctx->block);

   /* disabled code, kept for reference */
   #if 0
   // TODO: check if it is beneficial to not branch on continues
   /* trim linear phis in loop header */
   for (auto&& instr : loop_entry->instructions) {
      if (instr->opcode == aco_opcode::p_linear_phi) {
         aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
         new_phi->definitions[0] = instr->definitions[0];
         for (unsigned i = 0; i < new_phi->operands.size(); i++)
            new_phi->operands[i] = instr->operands[i];
         /* check that the remaining operands are all the same */
         for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
            assert(instr->operands[i].tempId() == instr->operands.back().tempId());
         instr.swap(new_phi);
      } else if (instr->opcode == aco_opcode::p_phi) {
         continue;
      } else {
         break;
      }
   }
   #endif
}
7406
7407 static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond)
7408 {
7409    ic->cond = cond;
7410
7411    append_logical_end(ctx->block);
7412    ctx->block->kind |= block_kind_branch;
7413
7414    /* branch to linear then block */
7415    assert(cond.regClass() == ctx->program->lane_mask);
7416    aco_ptr<Pseudo_branch_instruction> branch;
7417    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7418    branch->operands[0] = Operand(cond);
7419    ctx->block->instructions.push_back(std::move(branch));
7420
7421    ic->BB_if_idx = ctx->block->index;
7422    ic->BB_invert = Block();
7423    ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7424    /* Invert blocks are intentionally not marked as top level because they
7425     * are not part of the logical cfg. */
7426    ic->BB_invert.kind |= block_kind_invert;
7427    ic->BB_endif = Block();
7428    ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7429    ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
7430
7431    ic->exec_potentially_empty_old = ctx->cf_info.exec_potentially_empty;
7432    ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
7433    ctx->cf_info.parent_if.is_divergent = true;
7434    ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7435
7436    /** emit logical then block */
7437    Block* BB_then_logical = ctx->program->create_and_insert_block();
7438    BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7439    add_edge(ic->BB_if_idx, BB_then_logical);
7440    ctx->block = BB_then_logical;
7441    append_logical_start(BB_then_logical);
7442 }
7443
/* Close the then-branch of a divergent if and open its else-branch.
 *
 * The logical then block branches to the invert block; a new linear then
 * block (reached by a linear edge from the if block) does the same. The
 * invert block is then inserted and terminated with a p_cbranch_nz on the
 * if condition. Finally a logical else block is created, connected to the
 * if block (logical edge) and the invert block (linear edge), and made the
 * current block. */
static void begin_divergent_if_else(isel_context *ctx, if_context *ic)
{
   Block *BB_then_logical = ctx->block;
   append_logical_end(BB_then_logical);
   /* branch from logical then block to invert block */
   aco_ptr<Pseudo_branch_instruction> branch;
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
   BB_then_logical->instructions.emplace_back(std::move(branch));
   add_linear_edge(BB_then_logical->index, &ic->BB_invert);
   /* the logical edge to the endif block is only added when the then-branch
    * did not contain a divergent branch */
   if (!ctx->cf_info.parent_loop.has_divergent_branch)
      add_logical_edge(BB_then_logical->index, &ic->BB_endif);
   BB_then_logical->kind |= block_kind_uniform;
   assert(!ctx->cf_info.has_branch);
   /* remember the then-branch result and reset the flag for the else-branch */
   ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
   ctx->cf_info.parent_loop.has_divergent_branch = false;

   /** emit linear then block */
   Block* BB_then_linear = ctx->program->create_and_insert_block();
   BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
   BB_then_linear->kind |= block_kind_uniform;
   add_linear_edge(ic->BB_if_idx, BB_then_linear);
   /* branch from linear then block to invert block */
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
   BB_then_linear->instructions.emplace_back(std::move(branch));
   add_linear_edge(BB_then_linear->index, &ic->BB_invert);

   /** emit invert merge block */
   ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
   ic->invert_idx = ctx->block->index;

   /* branch to linear else block (skip else) */
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0));
   branch->operands[0] = Operand(ic->cond);
   ctx->block->instructions.push_back(std::move(branch));

   /* accumulate the then-branch state into the saved value before resetting it */
   ic->exec_potentially_empty_old |= ctx->cf_info.exec_potentially_empty;
   ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */

   /** emit logical else block */
   Block* BB_else_logical = ctx->program->create_and_insert_block();
   BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
   add_logical_edge(ic->BB_if_idx, BB_else_logical);
   add_linear_edge(ic->invert_idx, BB_else_logical);
   ctx->block = BB_else_logical;
   append_logical_start(BB_else_logical);
}
7490
7491 static void end_divergent_if(isel_context *ctx, if_context *ic)
7492 {
7493    Block *BB_else_logical = ctx->block;
7494    append_logical_end(BB_else_logical);
7495
7496    /* branch from logical else block to endif block */
7497    aco_ptr<Pseudo_branch_instruction> branch;
7498    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7499    BB_else_logical->instructions.emplace_back(std::move(branch));
7500    add_linear_edge(BB_else_logical->index, &ic->BB_endif);
7501    if (!ctx->cf_info.parent_loop.has_divergent_branch)
7502       add_logical_edge(BB_else_logical->index, &ic->BB_endif);
7503    BB_else_logical->kind |= block_kind_uniform;
7504
7505    assert(!ctx->cf_info.has_branch);
7506    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
7507
7508
7509    /** emit linear else block */
7510    Block* BB_else_linear = ctx->program->create_and_insert_block();
7511    BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7512    BB_else_linear->kind |= block_kind_uniform;
7513    add_linear_edge(ic->invert_idx, BB_else_linear);
7514
7515    /* branch from linear else block to endif block */
7516    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7517    BB_else_linear->instructions.emplace_back(std::move(branch));
7518    add_linear_edge(BB_else_linear->index, &ic->BB_endif);
7519
7520
7521    /** emit endif merge block */
7522    ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
7523    append_logical_start(ctx->block);
7524
7525
7526    ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
7527    ctx->cf_info.exec_potentially_empty |= ic->exec_potentially_empty_old;
7528    /* uniform control flow never has an empty exec-mask */
7529    if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
7530       ctx->cf_info.exec_potentially_empty = false;
7531 }
7532
7533 static void visit_if(isel_context *ctx, nir_if *if_stmt)
7534 {
7535    Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
7536    Builder bld(ctx->program, ctx->block);
7537    aco_ptr<Pseudo_branch_instruction> branch;
7538
7539    if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
7540       /**
7541        * Uniform conditionals are represented in the following way*) :
7542        *
7543        * The linear and logical CFG:
7544        *                        BB_IF
7545        *                        /    \
7546        *       BB_THEN (logical)      BB_ELSE (logical)
7547        *                        \    /
7548        *                        BB_ENDIF
7549        *
7550        * *) Exceptions may be due to break and continue statements within loops
7551        *    If a break/continue happens within uniform control flow, it branches
7552        *    to the loop exit/entry block. Otherwise, it branches to the next
7553        *    merge block.
7554        **/
7555       append_logical_end(ctx->block);
7556       ctx->block->kind |= block_kind_uniform;
7557
7558       /* emit branch */
7559       assert(cond.regClass() == bld.lm);
7560       // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
7561       cond = bool_to_scalar_condition(ctx, cond);
7562
7563       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7564       branch->operands[0] = Operand(cond);
7565       branch->operands[0].setFixed(scc);
7566       ctx->block->instructions.emplace_back(std::move(branch));
7567
7568       unsigned BB_if_idx = ctx->block->index;
7569       Block BB_endif = Block();
7570       BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7571       BB_endif.kind |= ctx->block->kind & block_kind_top_level;
7572
7573       /** emit then block */
7574       Block* BB_then = ctx->program->create_and_insert_block();
7575       BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7576       add_edge(BB_if_idx, BB_then);
7577       append_logical_start(BB_then);
7578       ctx->block = BB_then;
7579       visit_cf_list(ctx, &if_stmt->then_list);
7580       BB_then = ctx->block;
7581       bool then_branch = ctx->cf_info.has_branch;
7582       bool then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7583
7584       if (!then_branch) {
7585          append_logical_end(BB_then);
7586          /* branch from then block to endif block */
7587          branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7588          BB_then->instructions.emplace_back(std::move(branch));
7589          add_linear_edge(BB_then->index, &BB_endif);
7590          if (!then_branch_divergent)
7591             add_logical_edge(BB_then->index, &BB_endif);
7592          BB_then->kind |= block_kind_uniform;
7593       }
7594
7595       ctx->cf_info.has_branch = false;
7596       ctx->cf_info.parent_loop.has_divergent_branch = false;
7597
7598       /** emit else block */
7599       Block* BB_else = ctx->program->create_and_insert_block();
7600       BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7601       add_edge(BB_if_idx, BB_else);
7602       append_logical_start(BB_else);
7603       ctx->block = BB_else;
7604       visit_cf_list(ctx, &if_stmt->else_list);
7605       BB_else = ctx->block;
7606
7607       if (!ctx->cf_info.has_branch) {
7608          append_logical_end(BB_else);
7609          /* branch from then block to endif block */
7610          branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7611          BB_else->instructions.emplace_back(std::move(branch));
7612          add_linear_edge(BB_else->index, &BB_endif);
7613          if (!ctx->cf_info.parent_loop.has_divergent_branch)
7614             add_logical_edge(BB_else->index, &BB_endif);
7615          BB_else->kind |= block_kind_uniform;
7616       }
7617
7618       ctx->cf_info.has_branch &= then_branch;
7619       ctx->cf_info.parent_loop.has_divergent_branch &= then_branch_divergent;
7620
7621       /** emit endif merge block */
7622       if (!ctx->cf_info.has_branch) {
7623          ctx->block = ctx->program->insert_block(std::move(BB_endif));
7624          append_logical_start(ctx->block);
7625       }
7626    } else { /* non-uniform condition */
7627       /**
7628        * To maintain a logical and linear CFG without critical edges,
7629        * non-uniform conditionals are represented in the following way*) :
7630        *
7631        * The linear CFG:
7632        *                        BB_IF
7633        *                        /    \
7634        *       BB_THEN (logical)      BB_THEN (linear)
7635        *                        \    /
7636        *                        BB_INVERT (linear)
7637        *                        /    \
7638        *       BB_ELSE (logical)      BB_ELSE (linear)
7639        *                        \    /
7640        *                        BB_ENDIF
7641        *
7642        * The logical CFG:
7643        *                        BB_IF
7644        *                        /    \
7645        *       BB_THEN (logical)      BB_ELSE (logical)
7646        *                        \    /
7647        *                        BB_ENDIF
7648        *
7649        * *) Exceptions may be due to break and continue statements within loops
7650        **/
7651
7652       if_context ic;
7653
7654       begin_divergent_if_then(ctx, &ic, cond);
7655       visit_cf_list(ctx, &if_stmt->then_list);
7656
7657       begin_divergent_if_else(ctx, &ic);
7658       visit_cf_list(ctx, &if_stmt->else_list);
7659
7660       end_divergent_if(ctx, &ic);
7661    }
7662 }
7663
7664 static void visit_cf_list(isel_context *ctx,
7665                           struct exec_list *list)
7666 {
7667    foreach_list_typed(nir_cf_node, node, node, list) {
7668       switch (node->type) {
7669       case nir_cf_node_block:
7670          visit_block(ctx, nir_cf_node_as_block(node));
7671          break;
7672       case nir_cf_node_if:
7673          visit_if(ctx, nir_cf_node_as_if(node));
7674          break;
7675       case nir_cf_node_loop:
7676          visit_loop(ctx, nir_cf_node_as_loop(node));
7677          break;
7678       default:
7679          unreachable("unimplemented cf list type");
7680       }
7681    }
7682 }
7683
7684 static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
7685 {
7686    int offset = ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
7687    uint64_t mask = ctx->vs_output.mask[slot];
7688    if (!is_pos && !mask)
7689       return;
7690    if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
7691       return;
7692    aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7693    exp->enabled_mask = mask;
7694    for (unsigned i = 0; i < 4; ++i) {
7695       if (mask & (1 << i))
7696          exp->operands[i] = Operand(ctx->vs_output.outputs[slot][i]);
7697       else
7698          exp->operands[i] = Operand(v1);
7699    }
7700    exp->valid_mask = false;
7701    exp->done = false;
7702    exp->compressed = false;
7703    if (is_pos)
7704       exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7705    else
7706       exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
7707    ctx->block->instructions.emplace_back(std::move(exp));
7708 }
7709
7710 static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
7711 {
7712    aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7713    exp->enabled_mask = 0;
7714    for (unsigned i = 0; i < 4; ++i)
7715       exp->operands[i] = Operand(v1);
7716    if (ctx->vs_output.mask[VARYING_SLOT_PSIZ]) {
7717       exp->operands[0] = Operand(ctx->vs_output.outputs[VARYING_SLOT_PSIZ][0]);
7718       exp->enabled_mask |= 0x1;
7719    }
7720    if (ctx->vs_output.mask[VARYING_SLOT_LAYER]) {
7721       exp->operands[2] = Operand(ctx->vs_output.outputs[VARYING_SLOT_LAYER][0]);
7722       exp->enabled_mask |= 0x4;
7723    }
7724    if (ctx->vs_output.mask[VARYING_SLOT_VIEWPORT]) {
7725       if (ctx->options->chip_class < GFX9) {
7726          exp->operands[3] = Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]);
7727          exp->enabled_mask |= 0x8;
7728       } else {
7729          Builder bld(ctx->program, ctx->block);
7730
7731          Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
7732                              Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]));
7733          if (exp->operands[2].isTemp())
7734             out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
7735
7736          exp->operands[2] = Operand(out);
7737          exp->enabled_mask |= 0x4;
7738       }
7739    }
7740    exp->valid_mask = false;
7741    exp->done = false;
7742    exp->compressed = false;
7743    exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7744    ctx->block->instructions.emplace_back(std::move(exp));
7745 }
7746
7747 static void create_vs_exports(isel_context *ctx)
7748 {
7749    radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
7750
7751    if (outinfo->export_prim_id) {
7752       ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
7753       ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = get_arg(ctx, ctx->args->vs_prim_id);
7754    }
7755
7756    if (ctx->options->key.has_multiview_view_index) {
7757       ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1;
7758       ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
7759    }
7760
7761    /* the order these position exports are created is important */
7762    int next_pos = 0;
7763    export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
7764    if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
7765       export_vs_psiz_layer_viewport(ctx, &next_pos);
7766    }
7767    if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7768       export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
7769    if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7770       export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
7771
7772    if (ctx->options->key.vs_common_out.export_clip_dists) {
7773       if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7774          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
7775       if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7776          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
7777    }
7778
7779    for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
7780       if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER &&
7781           i != VARYING_SLOT_PRIMITIVE_ID)
7782          continue;
7783
7784       export_vs_varying(ctx, i, false, NULL);
7785    }
7786 }
7787
7788 static void emit_stream_output(isel_context *ctx,
7789                                Temp const *so_buffers,
7790                                Temp const *so_write_offset,
7791                                const struct radv_stream_output *output)
7792 {
7793    unsigned num_comps = util_bitcount(output->component_mask);
7794    unsigned loc = output->location;
7795    unsigned buf = output->buffer;
7796    unsigned offset = output->offset;
7797
7798    assert(num_comps && num_comps <= 4);
7799    if (!num_comps || num_comps > 4)
7800       return;
7801
7802    unsigned start = ffs(output->component_mask) - 1;
7803
7804    Temp out[4];
7805    bool all_undef = true;
7806    assert(ctx->stage == vertex_vs);
7807    for (unsigned i = 0; i < num_comps; i++) {
7808       out[i] = ctx->vs_output.outputs[loc][start + i];
7809       all_undef = all_undef && !out[i].id();
7810    }
7811    if (all_undef)
7812       return;
7813
7814    Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_comps)};
7815    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_comps, 1)};
7816    for (unsigned i = 0; i < num_comps; ++i)
7817       vec->operands[i] = (ctx->vs_output.mask[loc] & 1 << i) ? Operand(out[i]) : Operand(0u);
7818    vec->definitions[0] = Definition(write_data);
7819    ctx->block->instructions.emplace_back(std::move(vec));
7820
7821    aco_opcode opcode;
7822    switch (num_comps) {
7823    case 1:
7824       opcode = aco_opcode::buffer_store_dword;
7825       break;
7826    case 2:
7827       opcode = aco_opcode::buffer_store_dwordx2;
7828       break;
7829    case 3:
7830       opcode = aco_opcode::buffer_store_dwordx3;
7831       break;
7832    case 4:
7833       opcode = aco_opcode::buffer_store_dwordx4;
7834       break;
7835    }
7836
7837    aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
7838    store->operands[0] = Operand(so_write_offset[buf]);
7839    store->operands[1] = Operand(so_buffers[buf]);
7840    store->operands[2] = Operand((uint32_t) 0);
7841    store->operands[3] = Operand(write_data);
7842    if (offset > 4095) {
7843       /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */
7844       Builder bld(ctx->program, ctx->block);
7845       store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
7846    } else {
7847       store->offset = offset;
7848    }
7849    store->offen = true;
7850    store->glc = true;
7851    store->dlc = false;
7852    store->slc = true;
7853    store->can_reorder = true;
7854    ctx->block->instructions.emplace_back(std::move(store));
7855 }
7856
7857 static void emit_streamout(isel_context *ctx, unsigned stream)
7858 {
7859    Builder bld(ctx->program, ctx->block);
7860
7861    Temp so_buffers[4];
7862    Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
7863    for (unsigned i = 0; i < 4; i++) {
7864       unsigned stride = ctx->program->info->so.strides[i];
7865       if (!stride)
7866          continue;
7867
7868       so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, Operand(i * 16u));
7869    }
7870
7871    Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
7872                                 get_arg(ctx, ctx->args->streamout_config), Operand(0x70010u));
7873
7874    Temp tid = emit_mbcnt(ctx, bld.def(v1));
7875
7876    Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(s2), so_vtx_count, tid);
7877
7878    if_context ic;
7879    begin_divergent_if_then(ctx, &ic, can_emit);
7880
7881    bld.reset(ctx->block);
7882
7883    Temp so_write_index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->streamout_write_idx), tid);
7884
7885    Temp so_write_offset[4];
7886
7887    for (unsigned i = 0; i < 4; i++) {
7888       unsigned stride = ctx->program->info->so.strides[i];
7889       if (!stride)
7890          continue;
7891
7892       if (stride == 1) {
7893          Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
7894                                 get_arg(ctx, ctx->args->streamout_write_idx),
7895                                 get_arg(ctx, ctx->args->streamout_offset[i]));
7896          Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
7897
7898          so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
7899       } else {
7900          Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
7901          Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u),
7902                                  get_arg(ctx, ctx->args->streamout_offset[i]));
7903          so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
7904       }
7905    }
7906
7907    for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
7908       struct radv_stream_output *output =
7909          &ctx->program->info->so.outputs[i];
7910       if (stream != output->stream)
7911          continue;
7912
7913       emit_stream_output(ctx, so_buffers, so_write_offset, output);
7914    }
7915
7916    begin_divergent_if_else(ctx, &ic);
7917    end_divergent_if(ctx, &ic);
7918 }
7919
7920 } /* end namespace */
7921
7922 void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm)
7923 {
7924    /* Split all arguments except for the first (ring_offsets) and the last
7925     * (exec) so that the dead channels don't stay live throughout the program.
7926     */
7927    for (unsigned i = 1; i < startpgm->definitions.size() - 1; i++) {
7928       if (startpgm->definitions[i].regClass().size() > 1) {
7929          emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
7930                            startpgm->definitions[i].regClass().size());
7931       }
7932    }
7933 }
7934
7935 void handle_bc_optimize(isel_context *ctx)
7936 {
7937    /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
7938    Builder bld(ctx->program, ctx->block);
7939    uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
7940    bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
7941    bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
7942    ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
7943    ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
7944    if (uses_center && uses_centroid) {
7945       Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),
7946                               get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u));
7947
7948       if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
7949          Temp new_coord[2];
7950          for (unsigned i = 0; i < 2; i++) {
7951             Temp persp_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
7952             Temp persp_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
7953             new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7954                                     persp_centroid, persp_center, sel);
7955          }
7956          ctx->persp_centroid = bld.tmp(v2);
7957          bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
7958                     Operand(new_coord[0]), Operand(new_coord[1]));
7959          emit_split_vector(ctx, ctx->persp_centroid, 2);
7960       }
7961
7962       if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
7963          Temp new_coord[2];
7964          for (unsigned i = 0; i < 2; i++) {
7965             Temp linear_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
7966             Temp linear_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
7967             new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7968                                     linear_centroid, linear_center, sel);
7969          }
7970          ctx->linear_centroid = bld.tmp(v2);
7971          bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
7972                     Operand(new_coord[0]), Operand(new_coord[1]));
7973          emit_split_vector(ctx, ctx->linear_centroid, 2);
7974       }
7975    }
7976 }
7977
7978 void setup_fp_mode(isel_context *ctx, nir_shader *shader)
7979 {
7980    Program *program = ctx->program;
7981
7982    unsigned float_controls = shader->info.float_controls_execution_mode;
7983
7984    program->next_fp_mode.preserve_signed_zero_inf_nan32 =
7985       float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
7986    program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
7987       float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
7988                         FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
7989
7990    program->next_fp_mode.must_flush_denorms32 =
7991       float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
7992    program->next_fp_mode.must_flush_denorms16_64 =
7993       float_controls & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
7994                         FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
7995
7996    program->next_fp_mode.care_about_round32 =
7997       float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
7998
7999    program->next_fp_mode.care_about_round16_64 =
8000       float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
8001                         FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
8002
8003    /* default to preserving fp16 and fp64 denorms, since it's free */
8004    if (program->next_fp_mode.must_flush_denorms16_64)
8005       program->next_fp_mode.denorm16_64 = 0;
8006    else
8007       program->next_fp_mode.denorm16_64 = fp_denorm_keep;
8008
8009    /* preserving fp32 denorms is expensive, so only do it if asked */
8010    if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
8011       program->next_fp_mode.denorm32 = fp_denorm_keep;
8012    else
8013       program->next_fp_mode.denorm32 = 0;
8014
8015    if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
8016       program->next_fp_mode.round32 = fp_round_tz;
8017    else
8018       program->next_fp_mode.round32 = fp_round_ne;
8019
8020    if (float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
8021       program->next_fp_mode.round16_64 = fp_round_tz;
8022    else
8023       program->next_fp_mode.round16_64 = fp_round_ne;
8024
8025    ctx->block->fp_mode = program->next_fp_mode;
8026 }
8027
8028 void select_program(Program *program,
8029                     unsigned shader_count,
8030                     struct nir_shader *const *shaders,
8031                     ac_shader_config* config,
8032                     struct radv_shader_args *args)
8033 {
8034    isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args);
8035
8036    for (unsigned i = 0; i < shader_count; i++) {
8037       nir_shader *nir = shaders[i];
8038       init_context(&ctx, nir);
8039
8040       setup_fp_mode(&ctx, nir);
8041
8042       if (!i) {
8043          /* needs to be after init_context() for FS */
8044          Pseudo_instruction *startpgm = add_startpgm(&ctx);
8045          append_logical_start(ctx.block);
8046          split_arguments(&ctx, startpgm);
8047       }
8048
8049       if_context ic;
8050       if (shader_count >= 2) {
8051          Builder bld(ctx.program, ctx.block);
8052          Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u)));
8053          Temp thread_id = emit_mbcnt(&ctx, bld.def(v1));
8054          Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(bld.lm)), count, thread_id);
8055
8056          begin_divergent_if_then(&ctx, &ic, cond);
8057       }
8058
8059       if (i) {
8060          Builder bld(ctx.program, ctx.block);
8061          bld.barrier(aco_opcode::p_memory_barrier_shared); //TODO: different barriers are needed for different stages
8062          bld.sopp(aco_opcode::s_barrier);
8063       }
8064
8065       if (ctx.stage == fragment_fs)
8066          handle_bc_optimize(&ctx);
8067
8068       nir_function_impl *func = nir_shader_get_entrypoint(nir);
8069       visit_cf_list(&ctx, &func->body);
8070
8071       if (ctx.program->info->so.num_outputs/*&& !ctx->is_gs_copy_shader */)
8072          emit_streamout(&ctx, 0);
8073
8074       if (ctx.stage == vertex_vs)
8075          create_vs_exports(&ctx);
8076
8077       if (shader_count >= 2) {
8078          begin_divergent_if_else(&ctx, &ic);
8079          end_divergent_if(&ctx, &ic);
8080       }
8081
8082       ralloc_free(ctx.divergent_vals);
8083    }
8084
8085    program->config->float_mode = program->blocks[0].fp_mode.val;
8086
8087    append_logical_end(ctx.block);
8088    ctx.block->kind |= block_kind_uniform;
8089    Builder bld(ctx.program, ctx.block);
8090    if (ctx.program->wb_smem_l1_on_end)
8091       bld.smem(aco_opcode::s_dcache_wb, false);
8092    bld.sopp(aco_opcode::s_endpgm);
8093
8094    /* cleanup CFG */
8095    for (Block& BB : program->blocks) {
8096       for (unsigned idx : BB.linear_preds)
8097          program->blocks[idx].linear_succs.emplace_back(BB.index);
8098       for (unsigned idx : BB.logical_preds)
8099          program->blocks[idx].logical_succs.emplace_back(BB.index);
8100    }
8101 }
8102 }