aco: move some setup code into helpers
1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include <algorithm>
27 #include <array>
28 #include <stack>
29 #include <map>
30
31 #include "ac_shader_util.h"
32 #include "aco_ir.h"
33 #include "aco_builder.h"
34 #include "aco_interface.h"
35 #include "aco_instruction_selection_setup.cpp"
36 #include "util/fast_idiv_by_const.h"
37
38 namespace aco {
39 namespace {
40
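/* RAII helper: the constructor saves the current loop-related control-flow state in
 * isel_context and installs the state for the new loop (header index, exit block,
 * cleared divergence flags, incremented nest depth); the destructor restores the
 * previous state once the loop body has been visited. */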
41 class loop_info_RAII {
42 isel_context* ctx;
43 unsigned header_idx_old;
44 Block* exit_old;
45 bool divergent_cont_old;
46 bool divergent_branch_old;
47 bool divergent_if_old;
48
49 public:
50 loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
51 : ctx(ctx),
52 header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
53 divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
54 divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
55 divergent_if_old(ctx->cf_info.parent_if.is_divergent)
56 {
57 ctx->cf_info.parent_loop.header_idx = loop_header_idx;
58 ctx->cf_info.parent_loop.exit = loop_exit;
59 ctx->cf_info.parent_loop.has_divergent_continue = false;
60 ctx->cf_info.parent_loop.has_divergent_branch = false;
61 ctx->cf_info.parent_if.is_divergent = false;
62 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
63 }
64
65 ~loop_info_RAII()
66 {
67 ctx->cf_info.parent_loop.header_idx = header_idx_old;
68 ctx->cf_info.parent_loop.exit = exit_old;
69 ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
70 ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
71 ctx->cf_info.parent_if.is_divergent = divergent_if_old;
72 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
73 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
74 ctx->cf_info.exec_potentially_empty_discard = false;
75 }
76 };
77
78 struct if_context {
79 Temp cond;
80
81 bool divergent_old;
82 bool exec_potentially_empty_discard_old;
83 bool exec_potentially_empty_break_old;
84 uint16_t exec_potentially_empty_break_depth_old;
85
86 unsigned BB_if_idx;
87 unsigned invert_idx;
88 bool uniform_has_then_branch;
89 bool then_branch_divergent;
90 Block BB_invert;
91 Block BB_endif;
92 };
93
94 static bool visit_cf_list(struct isel_context *ctx,
95 struct exec_list *list);
96
97 static void add_logical_edge(unsigned pred_idx, Block *succ)
98 {
99 succ->logical_preds.emplace_back(pred_idx);
100 }
101
102
103 static void add_linear_edge(unsigned pred_idx, Block *succ)
104 {
105 succ->linear_preds.emplace_back(pred_idx);
106 }
107
108 static void add_edge(unsigned pred_idx, Block *succ)
109 {
110 add_logical_edge(pred_idx, succ);
111 add_linear_edge(pred_idx, succ);
112 }
113
114 static void append_logical_start(Block *b)
115 {
116 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
117 }
118
119 static void append_logical_end(Block *b)
120 {
121 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
122 }
123
124 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
125 {
126 assert(ctx->allocated[def->index].id());
127 return ctx->allocated[def->index];
128 }
129
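/* Emits v_mbcnt_lo/hi: counts the bits of mask_lo/mask_hi that are set for lanes below
 * the current lane. With the default all-ones masks this yields the invocation's index
 * within the wave (the _hi half is only needed for wave64). */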
130 Temp emit_mbcnt(isel_context *ctx, Definition dst,
131 Operand mask_lo = Operand((uint32_t) -1), Operand mask_hi = Operand((uint32_t) -1))
132 {
133 Builder bld(ctx->program, ctx->block);
134 Definition lo_def = ctx->program->wave_size == 32 ? dst : bld.def(v1);
135 Temp thread_id_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, lo_def, mask_lo, Operand(0u));
136
137 if (ctx->program->wave_size == 32) {
138 return thread_id_lo;
139 } else if (ctx->program->chip_class <= GFX7) {
140 Temp thread_id_hi = bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
141 return thread_id_hi;
142 } else {
143 Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, dst, mask_hi, thread_id_lo);
144 return thread_id_hi;
145 }
146 }
147
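/* Wraps src in a p_wqm pseudo-instruction so the value is computed in whole-quad mode
 * for fragment shaders (and marks the program as needing WQM if requested); in other
 * stages this degenerates to a plain copy. */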
148 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
149 {
150 Builder bld(ctx->program, ctx->block);
151
152 if (!dst.id())
153 dst = bld.tmp(src.regClass());
154
155 assert(src.size() == dst.size());
156
157 if (ctx->stage != fragment_fs) {
158 if (!dst.id())
159 return src;
160
161 bld.copy(Definition(dst), src);
162 return dst;
163 }
164
165 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
166 ctx->program->needs_wqm |= program_needs_wqm;
167 return dst;
168 }
169
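/* "Backwards" permute: every lane reads `data` from the lane selected by `index`.
 * A uniform index becomes a simple readlane; the VALU paths below either use
 * ds_bpermute_b32 directly or fall back to a p_bpermute pseudo-instruction where
 * the hardware instruction alone is not sufficient. */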
170 static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
171 {
172 if (index.regClass() == s1)
173 return bld.readlane(bld.def(s1), data, index);
174
175 if (ctx->options->chip_class <= GFX7) {
176 /* GFX6-7: there is no bpermute instruction */
177 Operand index_op(index);
178 Operand input_data(data);
179 index_op.setLateKill(true);
180 input_data.setLateKill(true);
181
182 return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
183 } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
184 /* GFX10 wave64 mode: emulate full-wave bpermute */
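         /* On GFX10 in wave64 mode, ds_bpermute_b32 only shuffles within a 32-lane half,
          * so this case goes through a p_bpermute pseudo that is expanded later using the
          * shared VGPRs reserved here together with the "same half" mask computed below. */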
185 if (!ctx->has_gfx10_wave64_bpermute) {
186 ctx->has_gfx10_wave64_bpermute = true;
187 ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
188 ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
189 }
190
191 Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
192 Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
193 Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
194 Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
195 Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
196 Operand input_data(data);
197
198 index_x4.setLateKill(true);
199 input_data.setLateKill(true);
200 same_half.setLateKill(true);
201
202 return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
203 } else {
204 /* GFX8-9 or GFX10 wave32: bpermute works normally */
205 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
206 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
207 }
208 }
209
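/* Emits a ds_swizzle_b32-style masked swizzle, where the source lane is computed as
 * ((lane & and_mask) | or_mask) ^ xor_mask. On GFX8+ a few common patterns are mapped
 * to DPP (quad_perm, row rotate, row mirror) instead of the LDS-based swizzle. */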
210 static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask)
211 {
212 if (ctx->options->chip_class >= GFX8) {
213 unsigned and_mask = mask & 0x1f;
214 unsigned or_mask = (mask >> 5) & 0x1f;
215 unsigned xor_mask = (mask >> 10) & 0x1f;
216
217 uint16_t dpp_ctrl = 0xffff;
218
219 // TODO: we could use DPP8 for some swizzles
220 if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
221 unsigned res[4] = {0, 1, 2, 3};
222 for (unsigned i = 0; i < 4; i++)
223 res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
224 dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
225 } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
226 dpp_ctrl = dpp_row_rr(8);
227 } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
228 dpp_ctrl = dpp_row_mirror;
229 } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
230 dpp_ctrl = dpp_row_half_mirror;
231 }
232
233 if (dpp_ctrl != 0xffff)
234 return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
235 }
236
237 return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
238 }
239
240 Temp as_vgpr(isel_context *ctx, Temp val)
241 {
242 if (val.type() == RegType::sgpr) {
243 Builder bld(ctx->program, ctx->block);
244 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
245 }
246 assert(val.type() == RegType::vgpr);
247 return val;
248 }
249
250 //assumes a != 0xffffffff
251 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
252 {
253 assert(b != 0);
254 Builder bld(ctx->program, ctx->block);
255
256 if (util_is_power_of_two_or_zero(b)) {
257 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
258 return;
259 }
260
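   /* Non-power-of-two divisors use the fixed-point reciprocal scheme computed by
    * util_compute_fast_udiv_info: an optional pre-shift, an optional increment, a
    * 32x32->high-32 multiply (v_mul_hi_u32) and a post-shift. Steps whose parameters
    * are zero are skipped below. */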
261 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
262
263 assert(info.multiplier <= 0xffffffff);
264
265 bool pre_shift = info.pre_shift != 0;
266 bool increment = info.increment != 0;
267 bool multiply = true;
268 bool post_shift = info.post_shift != 0;
269
270 if (!pre_shift && !increment && !multiply && !post_shift) {
271 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
272 return;
273 }
274
275 Temp pre_shift_dst = a;
276 if (pre_shift) {
277 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
278 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
279 }
280
281 Temp increment_dst = pre_shift_dst;
282 if (increment) {
283 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
284 bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
285 }
286
287 Temp multiply_dst = increment_dst;
288 if (multiply) {
289 multiply_dst = post_shift ? bld.tmp(v1) : dst;
290 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
291 bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
292 }
293
294 if (post_shift) {
295 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
296 }
297 }
298
299 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
300 {
301 Builder bld(ctx->program, ctx->block);
302 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
303 }
304
305
306 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
307 {
308 /* no need to extract the whole vector */
309 if (src.regClass() == dst_rc) {
310 assert(idx == 0);
311 return src;
312 }
313
314 assert(src.bytes() > (idx * dst_rc.bytes()));
315 Builder bld(ctx->program, ctx->block);
316 auto it = ctx->allocated_vec.find(src.id());
317 if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
318 if (it->second[idx].regClass() == dst_rc) {
319 return it->second[idx];
320 } else {
321 assert(!dst_rc.is_subdword());
322 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
323 return bld.copy(bld.def(dst_rc), it->second[idx]);
324 }
325 }
326
327 if (dst_rc.is_subdword())
328 src = as_vgpr(ctx, src);
329
330 if (src.bytes() == dst_rc.bytes()) {
331 assert(idx == 0);
332 return bld.copy(bld.def(dst_rc), src);
333 } else {
334 Temp dst = bld.tmp(dst_rc);
335 emit_extract_vector(ctx, src, idx, dst);
336 return dst;
337 }
338 }
339
340 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
341 {
342 if (num_components == 1)
343 return;
344 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
345 return;
346 RegClass rc;
347 if (num_components > vec_src.size()) {
348 if (vec_src.type() == RegType::sgpr) {
349 /* should still help get_alu_src() */
350 emit_split_vector(ctx, vec_src, vec_src.size());
351 return;
352 }
353 /* sub-dword split */
354 rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
355 } else {
356 rc = RegClass(vec_src.type(), vec_src.size() / num_components);
357 }
358 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
359 split->operands[0] = Operand(vec_src);
360 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
361 for (unsigned i = 0; i < num_components; i++) {
362 elems[i] = {ctx->program->allocateId(), rc};
363 split->definitions[i] = Definition(elems[i]);
364 }
365 ctx->block->instructions.emplace_back(std::move(split));
366 ctx->allocated_vec.emplace(vec_src.id(), elems);
367 }
368
369 /* This vector expansion uses a mask to determine which elements in the new vector
370 * come from the original vector. The other elements are undefined. */
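/* E.g. with num_components = 4 and mask = 0b0101, dst becomes
 * { vec_src[0], 0, vec_src[1], 0 } (the unused slots are currently filled with zero). */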
371 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
372 {
373 emit_split_vector(ctx, vec_src, util_bitcount(mask));
374
375 if (vec_src == dst)
376 return;
377
378 Builder bld(ctx->program, ctx->block);
379 if (num_components == 1) {
380 if (dst.type() == RegType::sgpr)
381 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
382 else
383 bld.copy(Definition(dst), vec_src);
384 return;
385 }
386
387 unsigned component_size = dst.size() / num_components;
388 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
389
390 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
391 vec->definitions[0] = Definition(dst);
392 unsigned k = 0;
393 for (unsigned i = 0; i < num_components; i++) {
394 if (mask & (1 << i)) {
395 Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
396 if (dst.type() == RegType::sgpr)
397 src = bld.as_uniform(src);
398 vec->operands[i] = Operand(src);
399 } else {
400 vec->operands[i] = Operand(0u);
401 }
402 elems[i] = vec->operands[i].getTemp();
403 }
404 ctx->block->instructions.emplace_back(std::move(vec));
405 ctx->allocated_vec.emplace(dst.id(), elems);
406 }
407
408 /* adjust misaligned small bit size loads */
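/* Shifts an SGPR value right by 8 * (offset & 0x3) bits so that a load which started
 * at an unaligned byte offset ends up properly aligned in dst. */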
409 void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
410 {
411 Builder bld(ctx->program, ctx->block);
412 Operand shift;
413 Temp select = Temp();
414 if (offset.isConstant()) {
415 assert(offset.constantValue() && offset.constantValue() < 4);
416 shift = Operand(offset.constantValue() * 8);
417 } else {
418 /* bit_offset = 8 * (offset & 0x3) */
419 Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u));
420 select = bld.tmp(s1);
421 shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u));
422 }
423
424 if (vec.size() == 1) {
425 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
426 } else if (vec.size() == 2) {
427 Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
428 bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
429 if (tmp == dst)
430 emit_split_vector(ctx, dst, 2);
431 else
432 emit_extract_vector(ctx, tmp, 0, dst);
433 } else if (vec.size() == 4) {
434 Temp lo = bld.tmp(s2), hi = bld.tmp(s2);
435 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
436 hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
437 if (select != Temp())
438 hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select));
439 lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
440 Temp mid = bld.tmp(s1);
441 lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
442 hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
443 mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
444 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
445 emit_split_vector(ctx, dst, 2);
446 }
447 }
448
449 void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
450 {
451 Builder bld(ctx->program, ctx->block);
452 if (offset.isTemp()) {
453 Temp tmp[4] = {vec, vec, vec, vec};
454
455 if (vec.size() == 4) {
456 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
457 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec);
458 } else if (vec.size() == 3) {
459 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
460 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
461 } else if (vec.size() == 2) {
462 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
463 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
464 }
465 for (unsigned i = 0; i < dst.size(); i++)
466 tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
467
468 vec = tmp[0];
469 if (dst.size() == 2)
470 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
471
472 offset = Operand(0u);
473 }
474
475 unsigned num_components = dst.bytes() / component_size;
476 if (vec.regClass() == dst.regClass()) {
477 assert(offset.constantValue() == 0);
478 bld.copy(Definition(dst), vec);
479 emit_split_vector(ctx, dst, num_components);
480 return;
481 }
482
483 emit_split_vector(ctx, vec, vec.bytes() / component_size);
484 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
485 RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
486
487 assert(offset.constantValue() % component_size == 0);
488 unsigned skip = offset.constantValue() / component_size;
489 for (unsigned i = 0; i < num_components; i++)
490 elems[i] = emit_extract_vector(ctx, vec, i + skip, rc);
491
492 /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
493 if (dst.type() == RegType::vgpr) {
494 aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
495 for (unsigned i = 0; i < num_components; i++)
496 create_vec->operands[i] = Operand(elems[i]);
497 create_vec->definitions[0] = Definition(dst);
498 bld.insert(std::move(create_vec));
499
500 /* if dst is sgpr - split the src, but move the original to sgpr. */
501 } else if (skip) {
502 vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
503 byte_align_scalar(ctx, vec, offset, dst);
504 } else {
505 assert(dst.size() == vec.size());
506 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
507 }
508
509 ctx->allocated_vec.emplace(dst.id(), elems);
510 }
511
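/* Booleans are kept either as a scalar 0/1 value in an s1 (SCC-style, for uniform
 * control flow) or as a per-lane mask of size bld.lm (for divergent values); the two
 * helpers below convert between these representations. */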
512 Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
513 {
514 Builder bld(ctx->program, ctx->block);
515 if (!dst.id())
516 dst = bld.tmp(bld.lm);
517
518 assert(val.regClass() == s1);
519 assert(dst.regClass() == bld.lm);
520
521 return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
522 }
523
524 Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
525 {
526 Builder bld(ctx->program, ctx->block);
527 if (!dst.id())
528 dst = bld.tmp(s1);
529
530 assert(val.regClass() == bld.lm);
531 assert(dst.regClass() == s1);
532
533 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
534 Temp tmp = bld.tmp(s1);
535 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
536 return emit_wqm(ctx, tmp, dst);
537 }
538
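/* Returns the requested NIR ALU source as a Temp, honoring the source swizzle:
 * identity swizzles return the SSA temp directly, otherwise the needed components are
 * extracted (using s_bfe for sub-dword SGPR sources) and re-packed into a vector. */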
539 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
540 {
541 if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
542 return get_ssa_temp(ctx, src.src.ssa);
543
544 if (src.src.ssa->num_components == size) {
545 bool identity_swizzle = true;
546 for (unsigned i = 0; identity_swizzle && i < size; i++) {
547 if (src.swizzle[i] != i)
548 identity_swizzle = false;
549 }
550 if (identity_swizzle)
551 return get_ssa_temp(ctx, src.src.ssa);
552 }
553
554 Temp vec = get_ssa_temp(ctx, src.src.ssa);
555 unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
556 assert(elem_size > 0);
557 assert(vec.bytes() % elem_size == 0);
558
559 if (elem_size < 4 && vec.type() == RegType::sgpr) {
560 assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
561 assert(size == 1);
562 unsigned swizzle = src.swizzle[0];
563 if (vec.size() > 1) {
564 assert(src.src.ssa->bit_size == 16);
565 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
566 swizzle = swizzle & 1;
567 }
568 if (swizzle == 0)
569 return vec;
570
571 Temp dst{ctx->program->allocateId(), s1};
572 aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 2)};
573 bfe->operands[0] = Operand(vec);
574 bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle)));
575 bfe->definitions[0] = Definition(dst);
576 bfe->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
577 ctx->block->instructions.emplace_back(std::move(bfe));
578 return dst;
579 }
580
581 RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4);
582 if (size == 1) {
583 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
584 } else {
585 assert(size <= 4);
586 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
587 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
588 for (unsigned i = 0; i < size; ++i) {
589 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
590 vec_instr->operands[i] = Operand{elems[i]};
591 }
592 Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size / 4)};
593 vec_instr->definitions[0] = Definition(dst);
594 ctx->block->instructions.emplace_back(std::move(vec_instr));
595 ctx->allocated_vec.emplace(dst.id(), elems);
596 return dst;
597 }
598 }
599
600 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
601 {
602 if (ptr.size() == 2)
603 return ptr;
604 Builder bld(ctx->program, ctx->block);
605 if (ptr.type() == RegType::vgpr)
606 ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
607 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
608 ptr, Operand((unsigned)ctx->options->address32_hi));
609 }
610
611 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
612 {
613 aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
614 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
615 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
616 sop2->definitions[0] = Definition(dst);
617 if (instr->no_unsigned_wrap)
618 sop2->definitions[0].setNUW(true);
619 if (writes_scc)
620 sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
621 ctx->block->instructions.emplace_back(std::move(sop2));
622 }
623
624 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
625 bool commutative, bool swap_srcs=false, bool flush_denorms = false)
626 {
627 Builder bld(ctx->program, ctx->block);
628 bld.is_precise = instr->exact;
629
630 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
631 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
632 if (src1.type() == RegType::sgpr) {
633 if (commutative && src0.type() == RegType::vgpr) {
634 Temp t = src0;
635 src0 = src1;
636 src1 = t;
637 } else {
638 src1 = as_vgpr(ctx, src1);
639 }
640 }
641
642 if (flush_denorms && ctx->program->chip_class < GFX9) {
643 assert(dst.size() == 1);
644 Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
645 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
646 } else {
647 bld.vop2(op, Definition(dst), src0, src1);
648 }
649 }
650
651 void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr,
652 aco_opcode op, Temp dst)
653 {
654 Builder bld(ctx->program, ctx->block);
655 bld.is_precise = instr->exact;
656
657 Temp src0 = get_alu_src(ctx, instr->src[0]);
658 Temp src1 = get_alu_src(ctx, instr->src[1]);
659
660 if (src1.type() == RegType::sgpr) {
661 assert(src0.type() == RegType::vgpr);
662 std::swap(src0, src1);
663 }
664
665 Temp src00 = bld.tmp(src0.type(), 1);
666 Temp src01 = bld.tmp(src0.type(), 1);
667 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
668 Temp src10 = bld.tmp(v1);
669 Temp src11 = bld.tmp(v1);
670 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
671 Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
672 Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
673 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
674 }
675
676 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
677 bool flush_denorms = false)
678 {
679 Temp src0 = get_alu_src(ctx, instr->src[0]);
680 Temp src1 = get_alu_src(ctx, instr->src[1]);
681 Temp src2 = get_alu_src(ctx, instr->src[2]);
682
683 /* ensure that the instruction has at most 1 sgpr operand
684 * The optimizer will inline constants for us */
685 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
686 src0 = as_vgpr(ctx, src0);
687 if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
688 src1 = as_vgpr(ctx, src1);
689 if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
690 src2 = as_vgpr(ctx, src2);
691
692 Builder bld(ctx->program, ctx->block);
693 bld.is_precise = instr->exact;
694 if (flush_denorms && ctx->program->chip_class < GFX9) {
695 assert(dst.size() == 1);
696 Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2);
697 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
698 } else {
699 bld.vop3(op, Definition(dst), src0, src1, src2);
700 }
701 }
702
703 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
704 {
705 Builder bld(ctx->program, ctx->block);
706 bld.is_precise = instr->exact;
707 if (dst.type() == RegType::sgpr)
708 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
709 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
710 else
711 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
712 }
713
714 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
715 {
716 Temp src0 = get_alu_src(ctx, instr->src[0]);
717 Temp src1 = get_alu_src(ctx, instr->src[1]);
718 assert(src0.size() == src1.size());
719
720 aco_ptr<Instruction> vopc;
721 if (src1.type() == RegType::sgpr) {
722 if (src0.type() == RegType::vgpr) {
723 /* to swap the operands, we might also have to change the opcode */
724 switch (op) {
725 case aco_opcode::v_cmp_lt_f16:
726 op = aco_opcode::v_cmp_gt_f16;
727 break;
728 case aco_opcode::v_cmp_ge_f16:
729 op = aco_opcode::v_cmp_le_f16;
730 break;
731 case aco_opcode::v_cmp_lt_i16:
732 op = aco_opcode::v_cmp_gt_i16;
733 break;
734 case aco_opcode::v_cmp_ge_i16:
735 op = aco_opcode::v_cmp_le_i16;
736 break;
737 case aco_opcode::v_cmp_lt_u16:
738 op = aco_opcode::v_cmp_gt_u16;
739 break;
740 case aco_opcode::v_cmp_ge_u16:
741 op = aco_opcode::v_cmp_le_u16;
742 break;
743 case aco_opcode::v_cmp_lt_f32:
744 op = aco_opcode::v_cmp_gt_f32;
745 break;
746 case aco_opcode::v_cmp_ge_f32:
747 op = aco_opcode::v_cmp_le_f32;
748 break;
749 case aco_opcode::v_cmp_lt_i32:
750 op = aco_opcode::v_cmp_gt_i32;
751 break;
752 case aco_opcode::v_cmp_ge_i32:
753 op = aco_opcode::v_cmp_le_i32;
754 break;
755 case aco_opcode::v_cmp_lt_u32:
756 op = aco_opcode::v_cmp_gt_u32;
757 break;
758 case aco_opcode::v_cmp_ge_u32:
759 op = aco_opcode::v_cmp_le_u32;
760 break;
761 case aco_opcode::v_cmp_lt_f64:
762 op = aco_opcode::v_cmp_gt_f64;
763 break;
764 case aco_opcode::v_cmp_ge_f64:
765 op = aco_opcode::v_cmp_le_f64;
766 break;
767 case aco_opcode::v_cmp_lt_i64:
768 op = aco_opcode::v_cmp_gt_i64;
769 break;
770 case aco_opcode::v_cmp_ge_i64:
771 op = aco_opcode::v_cmp_le_i64;
772 break;
773 case aco_opcode::v_cmp_lt_u64:
774 op = aco_opcode::v_cmp_gt_u64;
775 break;
776 case aco_opcode::v_cmp_ge_u64:
777 op = aco_opcode::v_cmp_le_u64;
778 break;
779 default: /* eq and ne are commutative */
780 break;
781 }
782 Temp t = src0;
783 src0 = src1;
784 src1 = t;
785 } else {
786 src1 = as_vgpr(ctx, src1);
787 }
788 }
789
790 Builder bld(ctx->program, ctx->block);
791 bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
792 }
793
794 void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
795 {
796 Temp src0 = get_alu_src(ctx, instr->src[0]);
797 Temp src1 = get_alu_src(ctx, instr->src[1]);
798 Builder bld(ctx->program, ctx->block);
799
800 assert(dst.regClass() == bld.lm);
801 assert(src0.type() == RegType::sgpr);
802 assert(src1.type() == RegType::sgpr);
803 assert(src0.regClass() == src1.regClass());
804
805 /* Emit the SALU comparison instruction */
806 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
807 /* Turn the result into a per-lane bool */
808 bool_to_vector_condition(ctx, cmp, dst);
809 }
810
811 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
812 aco_opcode v16_op, aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
813 {
814 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
815 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
816 bool use_valu = s_op == aco_opcode::num_opcodes ||
817 nir_dest_is_divergent(instr->dest.dest) ||
818 ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
819 ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
820 aco_opcode op = use_valu ? v_op : s_op;
821 assert(op != aco_opcode::num_opcodes);
822 assert(dst.regClass() == ctx->program->lane_mask);
823
824 if (use_valu)
825 emit_vopc_instruction(ctx, instr, op, dst);
826 else
827 emit_sopc_instruction(ctx, instr, op, dst);
828 }
829
830 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
831 {
832 Builder bld(ctx->program, ctx->block);
833 Temp src0 = get_alu_src(ctx, instr->src[0]);
834 Temp src1 = get_alu_src(ctx, instr->src[1]);
835
836 assert(dst.regClass() == bld.lm);
837 assert(src0.regClass() == bld.lm);
838 assert(src1.regClass() == bld.lm);
839
840 bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
841 }
842
843 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
844 {
845 Builder bld(ctx->program, ctx->block);
846 Temp cond = get_alu_src(ctx, instr->src[0]);
847 Temp then = get_alu_src(ctx, instr->src[1]);
848 Temp els = get_alu_src(ctx, instr->src[2]);
849
850 assert(cond.regClass() == bld.lm);
851
852 if (dst.type() == RegType::vgpr) {
853 aco_ptr<Instruction> bcsel;
854 if (dst.size() == 1) {
855 then = as_vgpr(ctx, then);
856 els = as_vgpr(ctx, els);
857
858 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
859 } else if (dst.size() == 2) {
860 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
861 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
862 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
863 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
864
865 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
866 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
867
868 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
869 } else {
870 fprintf(stderr, "Unimplemented NIR instr bit size: ");
871 nir_print_instr(&instr->instr, stderr);
872 fprintf(stderr, "\n");
873 }
874 return;
875 }
876
877 if (instr->dest.dest.ssa.bit_size == 1) {
878 assert(dst.regClass() == bld.lm);
879 assert(then.regClass() == bld.lm);
880 assert(els.regClass() == bld.lm);
881 }
882
883 if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
884 if (dst.regClass() == s1 || dst.regClass() == s2) {
885 assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
886 assert(dst.size() == then.size());
887 aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
888 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
889 } else {
890 fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
891 nir_print_instr(&instr->instr, stderr);
892 fprintf(stderr, "\n");
893 }
894 return;
895 }
896
897 /* divergent boolean bcsel
898  * this implements bcsel on bools: dst = s0 ? s1 : s2,
899  * which is computed as: dst = (s0 & s1) | (~s0 & s2) */
900 assert(instr->dest.dest.ssa.bit_size == 1);
901
902 if (cond.id() != then.id())
903 then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
904
905 if (cond.id() == els.id())
906 bld.sop1(Builder::s_mov, Definition(dst), then);
907 else
908 bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
909 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
910 }
911
912 void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
913 aco_opcode op, uint32_t undo)
914 {
915 /* multiply by 16777216 to handle denormals */
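   /* 0x4b800000 is 2^24 as a float; the v_cmp_class mask (1u << 7) | (1u << 4) selects
    * positive and negative denormals, which take the scaled path and are corrected with
    * `undo` afterwards. */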
916 Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
917 as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
918 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
919 scaled = bld.vop1(op, bld.def(v1), scaled);
920 scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);
921
922 Temp not_scaled = bld.vop1(op, bld.def(v1), val);
923
924 bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
925 }
926
927 void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
928 {
929 if (ctx->block->fp_mode.denorm32 == 0) {
930 bld.vop1(aco_opcode::v_rcp_f32, dst, val);
931 return;
932 }
933
934 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
935 }
936
937 void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
938 {
939 if (ctx->block->fp_mode.denorm32 == 0) {
940 bld.vop1(aco_opcode::v_rsq_f32, dst, val);
941 return;
942 }
943
944 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
945 }
946
947 void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
948 {
949 if (ctx->block->fp_mode.denorm32 == 0) {
950 bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
951 return;
952 }
953
954 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
955 }
956
957 void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
958 {
959 if (ctx->block->fp_mode.denorm32 == 0) {
960 bld.vop1(aco_opcode::v_log_f32, dst, val);
961 return;
962 }
963
964 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
965 }
966
967 Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
968 {
969 if (ctx->options->chip_class >= GFX7)
970 return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
971
972 /* GFX6 doesn't support V_TRUNC_F64, lower it. */
973 /* TODO: create more efficient code! */
974 if (val.type() == RegType::sgpr)
975 val = as_vgpr(ctx, val);
976
977 /* Split the input value. */
978 Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
979 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
980
981 /* Extract the exponent and compute the unbiased value. */
982 Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
983 exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));
984
985 /* Extract the fractional part. */
986 Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
987 fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
988
989 Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
990 bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask);
991
992 Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
993 Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
994 fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
995 tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
996 fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
997
998 /* Get the sign bit. */
999 Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi);
1000
1001 /* Decide the operation to apply depending on the unbiased exponent. */
1002 Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
1003 Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0);
1004 Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
1005 Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u));
1006 dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
1007 dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
1008
1009 return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
1010 }
1011
1012 Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
1013 {
1014 if (ctx->options->chip_class >= GFX7)
1015 return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1016
1017 /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1018 * lowered at NIR level for precision reasons). */
1019 Temp src0 = as_vgpr(ctx, val);
1020
1021 Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
1022 Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu));
1023
1024 Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
1025 Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
1026 Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
1027
1028 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1029 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
1030 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1031 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1032
1033 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1034 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1035
1036 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1037
1038 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
1039 static_cast<VOP3A_instruction*>(add)->neg[1] = true;
1040
1041 return add->definitions[0].getTemp();
1042 }
1043
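/* Sign- or zero-extends (or truncates) an integer from src_bits to dst_bits:
 * s_sext/s_and for SGPRs, an SDWA mov on GFX8+, v_bfe on GFX6-7, plus an extra high
 * dword (sign bits or zero) when widening to 64 bits. */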
1044 Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) {
1045 if (!dst.id()) {
1046 if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
1047 dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
1048 else
1049 dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
1050 }
1051
1052 if (dst.bytes() == src.bytes() && dst_bits < src_bits)
1053 return bld.copy(Definition(dst), src);
1054 else if (dst.bytes() < src.bytes())
1055 return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
1056
1057 Temp tmp = dst;
1058 if (dst_bits == 64)
1059 tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
1060
1061 if (tmp == src) {
1062 } else if (src.regClass() == s1) {
1063 if (is_signed)
1064 bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src);
1065 else
1066 bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src);
1067 } else if (ctx->options->chip_class >= GFX8) {
1068 assert(src_bits != 8 || src.regClass() == v1b);
1069 assert(src_bits != 16 || src.regClass() == v2b);
1070 aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
1071 sdwa->operands[0] = Operand(src);
1072 sdwa->definitions[0] = Definition(tmp);
1073 if (is_signed)
1074 sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
1075 else
1076 sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
1077 sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
1078 bld.insert(std::move(sdwa));
1079 } else {
1080 assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
1081 aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
1082 bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u));
1083 }
1084
1085 if (dst_bits == 64) {
1086 if (is_signed && dst.regClass() == s2) {
1087 Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u));
1088 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
1089 } else if (is_signed && dst.regClass() == v2) {
1090 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp);
1091 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
1092 } else {
1093 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
1094 }
1095 }
1096
1097 return dst;
1098 }
1099
1100 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
1101 {
1102 if (!instr->dest.dest.is_ssa) {
1103 fprintf(stderr, "nir alu dst not in ssa: ");
1104 nir_print_instr(&instr->instr, stderr);
1105 fprintf(stderr, "\n");
1106 abort();
1107 }
1108 Builder bld(ctx->program, ctx->block);
1109 bld.is_precise = instr->exact;
1110 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
1111 switch(instr->op) {
1112 case nir_op_vec2:
1113 case nir_op_vec3:
1114 case nir_op_vec4: {
1115 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
1116 unsigned num = instr->dest.dest.ssa.num_components;
1117 for (unsigned i = 0; i < num; ++i)
1118 elems[i] = get_alu_src(ctx, instr->src[i]);
1119
1120 if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
1121 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
1122 RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
1123 for (unsigned i = 0; i < num; ++i) {
1124 if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1125 vec->operands[i] = Operand(emit_extract_vector(ctx, elems[i], 0, elem_rc));
1126 else
1127 vec->operands[i] = Operand{elems[i]};
1128 }
1129 vec->definitions[0] = Definition(dst);
1130 ctx->block->instructions.emplace_back(std::move(vec));
1131 ctx->allocated_vec.emplace(dst.id(), elems);
1132 } else {
1133 // TODO: that is a bit suboptimal..
1134 Temp mask = bld.copy(bld.def(s1), Operand((1u << instr->dest.dest.ssa.bit_size) - 1));
1135 for (unsigned i = 0; i < num - 1; ++i)
1136 if (((i+1) * instr->dest.dest.ssa.bit_size) % 32)
1137 elems[i] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1138 for (unsigned i = 0; i < num; ++i) {
1139 unsigned bit = i * instr->dest.dest.ssa.bit_size;
1140 if (bit % 32 == 0) {
1141 elems[bit / 32] = elems[i];
1142 } else {
1143 elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
1144 elems[i], Operand((i * instr->dest.dest.ssa.bit_size) % 32));
1145 elems[bit / 32] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[bit / 32], elems[i]);
1146 }
1147 }
1148 if (dst.size() == 1)
1149 bld.copy(Definition(dst), elems[0]);
1150 else
1151 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), elems[0], elems[1]);
1152 }
1153 break;
1154 }
1155 case nir_op_mov: {
1156 Temp src = get_alu_src(ctx, instr->src[0]);
1157 aco_ptr<Instruction> mov;
1158 if (dst.type() == RegType::sgpr) {
1159 if (src.type() == RegType::vgpr)
1160 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1161 else if (src.regClass() == s1)
1162 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
1163 else if (src.regClass() == s2)
1164 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
1165 else
1166 unreachable("wrong src register class for nir_op_imov");
1167 } else {
1168 if (dst.regClass() == v1)
1169 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
1170 else if (dst.regClass() == v1b ||
1171 dst.regClass() == v2b ||
1172 dst.regClass() == v2)
1173 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
1174 else
1175 unreachable("wrong src register class for nir_op_imov");
1176 }
1177 break;
1178 }
1179 case nir_op_inot: {
1180 Temp src = get_alu_src(ctx, instr->src[0]);
1181 if (instr->dest.dest.ssa.bit_size == 1) {
1182 assert(src.regClass() == bld.lm);
1183 assert(dst.regClass() == bld.lm);
1184 /* Don't use s_andn2 here; this allows the optimizer to make a better decision */
1185 Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
1186 bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
1187 } else if (dst.regClass() == v1) {
1188 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1189 } else if (dst.regClass() == v2) {
1190 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1191 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1192 lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1193 hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1194 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1195 } else if (dst.type() == RegType::sgpr) {
1196 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1197 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1198 } else {
1199 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1200 nir_print_instr(&instr->instr, stderr);
1201 fprintf(stderr, "\n");
1202 }
1203 break;
1204 }
1205 case nir_op_ineg: {
1206 Temp src = get_alu_src(ctx, instr->src[0]);
1207 if (dst.regClass() == v1) {
1208 bld.vsub32(Definition(dst), Operand(0u), Operand(src));
1209 } else if (dst.regClass() == s1) {
1210 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
1211 } else if (dst.size() == 2) {
1212 Temp src0 = bld.tmp(dst.type(), 1);
1213 Temp src1 = bld.tmp(dst.type(), 1);
1214 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
1215
1216 if (dst.regClass() == s2) {
1217 Temp carry = bld.tmp(s1);
1218 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
1219 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
1220 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1221 } else {
1222 Temp lower = bld.tmp(v1);
1223 Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
1224 Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
1225 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1226 }
1227 } else {
1228 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1229 nir_print_instr(&instr->instr, stderr);
1230 fprintf(stderr, "\n");
1231 }
1232 break;
1233 }
1234 case nir_op_iabs: {
1235 if (dst.regClass() == s1) {
1236 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
1237 } else if (dst.regClass() == v1) {
1238 Temp src = get_alu_src(ctx, instr->src[0]);
1239 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
1240 } else {
1241 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1242 nir_print_instr(&instr->instr, stderr);
1243 fprintf(stderr, "\n");
1244 }
1245 break;
1246 }
1247 case nir_op_isign: {
1248 Temp src = get_alu_src(ctx, instr->src[0]);
1249 if (dst.regClass() == s1) {
1250 Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand((uint32_t)-1));
1251 bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand(1u));
1252 } else if (dst.regClass() == s2) {
1253 Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
1254 Temp neqz;
1255 if (ctx->program->chip_class >= GFX8)
1256 neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
1257 else
1258 neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp();
1259 /* SCC gets zero-extended to 64 bit */
1260 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1261 } else if (dst.regClass() == v1) {
1262 bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand((uint32_t)-1), src, Operand(1u));
1263 } else if (dst.regClass() == v2) {
1264 Temp upper = emit_extract_vector(ctx, src, 1, v1);
1265 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
1266 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1267 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
1268 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
1269 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1270 } else {
1271 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1272 nir_print_instr(&instr->instr, stderr);
1273 fprintf(stderr, "\n");
1274 }
1275 break;
1276 }
1277 case nir_op_imax: {
1278 if (dst.regClass() == v1) {
1279 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1280 } else if (dst.regClass() == s1) {
1281 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1282 } else {
1283 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1284 nir_print_instr(&instr->instr, stderr);
1285 fprintf(stderr, "\n");
1286 }
1287 break;
1288 }
1289 case nir_op_umax: {
1290 if (dst.regClass() == v1) {
1291 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1292 } else if (dst.regClass() == s1) {
1293 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1294 } else {
1295 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1296 nir_print_instr(&instr->instr, stderr);
1297 fprintf(stderr, "\n");
1298 }
1299 break;
1300 }
1301 case nir_op_imin: {
1302 if (dst.regClass() == v1) {
1303 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1304 } else if (dst.regClass() == s1) {
1305 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1306 } else {
1307 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1308 nir_print_instr(&instr->instr, stderr);
1309 fprintf(stderr, "\n");
1310 }
1311 break;
1312 }
1313 case nir_op_umin: {
1314 if (dst.regClass() == v1) {
1315 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1316 } else if (dst.regClass() == s1) {
1317 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1318 } else {
1319 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1320 nir_print_instr(&instr->instr, stderr);
1321 fprintf(stderr, "\n");
1322 }
1323 break;
1324 }
1325 case nir_op_ior: {
1326 if (instr->dest.dest.ssa.bit_size == 1) {
1327 emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1328 } else if (dst.regClass() == v1) {
1329 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1330 } else if (dst.regClass() == v2) {
1331 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1332 } else if (dst.regClass() == s1) {
1333 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1334 } else if (dst.regClass() == s2) {
1335 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1336 } else {
1337 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1338 nir_print_instr(&instr->instr, stderr);
1339 fprintf(stderr, "\n");
1340 }
1341 break;
1342 }
1343 case nir_op_iand: {
1344 if (instr->dest.dest.ssa.bit_size == 1) {
1345 emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1346 } else if (dst.regClass() == v1) {
1347 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1348 } else if (dst.regClass() == v2) {
1349 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1350 } else if (dst.regClass() == s1) {
1351 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1352 } else if (dst.regClass() == s2) {
1353 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1354 } else {
1355 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1356 nir_print_instr(&instr->instr, stderr);
1357 fprintf(stderr, "\n");
1358 }
1359 break;
1360 }
1361 case nir_op_ixor: {
1362 if (instr->dest.dest.ssa.bit_size == 1) {
1363 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1364 } else if (dst.regClass() == v1) {
1365 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1366 } else if (dst.regClass() == v2) {
1367 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1368 } else if (dst.regClass() == s1) {
1369 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1370 } else if (dst.regClass() == s2) {
1371 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1372 } else {
1373 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1374 nir_print_instr(&instr->instr, stderr);
1375 fprintf(stderr, "\n");
1376 }
1377 break;
1378 }
1379 case nir_op_ushr: {
1380 if (dst.regClass() == v1) {
1381 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1382 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1383 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
1384 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1385 } else if (dst.regClass() == v2) {
1386 bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
1387 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1388 } else if (dst.regClass() == s2) {
1389 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1390 } else if (dst.regClass() == s1) {
1391 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1392 } else {
1393 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1394 nir_print_instr(&instr->instr, stderr);
1395 fprintf(stderr, "\n");
1396 }
1397 break;
1398 }
1399 case nir_op_ishl: {
1400 if (dst.regClass() == v1) {
1401 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
1402 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1403 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
1404 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1405 } else if (dst.regClass() == v2) {
1406 bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
1407 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1408 } else if (dst.regClass() == s1) {
1409 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
1410 } else if (dst.regClass() == s2) {
1411 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1412 } else {
1413 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1414 nir_print_instr(&instr->instr, stderr);
1415 fprintf(stderr, "\n");
1416 }
1417 break;
1418 }
1419 case nir_op_ishr: {
1420 if (dst.regClass() == v1) {
1421 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1422 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1423 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
1424 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1425 } else if (dst.regClass() == v2) {
1426 bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
1427 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1428 } else if (dst.regClass() == s1) {
1429 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1430 } else if (dst.regClass() == s2) {
1431 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1432 } else {
1433 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1434 nir_print_instr(&instr->instr, stderr);
1435 fprintf(stderr, "\n");
1436 }
1437 break;
1438 }
1439 case nir_op_find_lsb: {
1440 Temp src = get_alu_src(ctx, instr->src[0]);
1441 if (src.regClass() == s1) {
1442 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1443 } else if (src.regClass() == v1) {
1444 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1445 } else if (src.regClass() == s2) {
1446 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1447 } else {
1448 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1449 nir_print_instr(&instr->instr, stderr);
1450 fprintf(stderr, "\n");
1451 }
1452 break;
1453 }
1454 case nir_op_ufind_msb:
1455 case nir_op_ifind_msb: {
1456 Temp src = get_alu_src(ctx, instr->src[0]);
1457 if (src.regClass() == s1 || src.regClass() == s2) {
1458 aco_opcode op = src.regClass() == s2 ?
1459 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
1460 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
1461 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1462
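/* s_flbit returns the bit position counted from the MSB, so the index from the
 * LSB is (bits - 1) - msb_rev. If no bit is found, s_flbit returns -1, the
 * subtraction below underflows (scc set) and s_cselect yields -1. */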
1463 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1464 Operand(src.size() * 32u - 1u), msb_rev);
1465 Temp msb = sub.def(0).getTemp();
1466 Temp carry = sub.def(1).getTemp();
1467
1468 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry));
1469 } else if (src.regClass() == v1) {
1470 aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1471 Temp msb_rev = bld.tmp(v1);
1472 emit_vop1_instruction(ctx, instr, op, msb_rev);
1473 Temp msb = bld.tmp(v1);
1474 Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
1475 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
1476 } else {
1477 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1478 nir_print_instr(&instr->instr, stderr);
1479 fprintf(stderr, "\n");
1480 }
1481 break;
1482 }
1483 case nir_op_bitfield_reverse: {
1484 if (dst.regClass() == s1) {
1485 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1486 } else if (dst.regClass() == v1) {
1487 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1488 } else {
1489 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1490 nir_print_instr(&instr->instr, stderr);
1491 fprintf(stderr, "\n");
1492 }
1493 break;
1494 }
1495 case nir_op_iadd: {
1496 if (dst.regClass() == s1) {
1497 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1498 break;
1499 }
1500
1501 Temp src0 = get_alu_src(ctx, instr->src[0]);
1502 Temp src1 = get_alu_src(ctx, instr->src[1]);
1503 if (dst.regClass() == v1) {
1504 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1505 break;
1506 }
1507
1508 assert(src0.size() == 2 && src1.size() == 2);
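/* 64-bit additions are split into 32-bit halves; the carry-out of the low half
 * feeds the high-half add (s_addc_u32 / vadd32 with carry-in). */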
1509 Temp src00 = bld.tmp(src0.type(), 1);
1510 Temp src01 = bld.tmp(dst.type(), 1);
1511 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1512 Temp src10 = bld.tmp(src1.type(), 1);
1513 Temp src11 = bld.tmp(dst.type(), 1);
1514 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1515
1516 if (dst.regClass() == s2) {
1517 Temp carry = bld.tmp(s1);
1518 Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1519 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
1520 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1521 } else if (dst.regClass() == v2) {
1522 Temp dst0 = bld.tmp(v1);
1523 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1524 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1525 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1526 } else {
1527 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1528 nir_print_instr(&instr->instr, stderr);
1529 fprintf(stderr, "\n");
1530 }
1531 break;
1532 }
1533 case nir_op_uadd_sat: {
1534 Temp src0 = get_alu_src(ctx, instr->src[0]);
1535 Temp src1 = get_alu_src(ctx, instr->src[1]);
1536 if (dst.regClass() == s1) {
1537 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1538 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1539 src0, src1);
1540 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1541 } else if (dst.regClass() == v1) {
1542 if (ctx->options->chip_class >= GFX9) {
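/* GFX9+ v_add_u32 has no carry-out, but the VOP3 clamp bit saturates the
 * unsigned result, which is exactly uadd_sat. */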
1543 aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1544 add->operands[0] = Operand(src0);
1545 add->operands[1] = Operand(src1);
1546 add->definitions[0] = Definition(dst);
1547 add->clamp = 1;
1548 ctx->block->instructions.emplace_back(std::move(add));
1549 } else {
1550 if (src1.regClass() != v1)
1551 std::swap(src0, src1);
1552 assert(src1.regClass() == v1);
1553 Temp tmp = bld.tmp(v1);
1554 Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1555 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1556 }
1557 } else {
1558 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1559 nir_print_instr(&instr->instr, stderr);
1560 fprintf(stderr, "\n");
1561 }
1562 break;
1563 }
1564 case nir_op_uadd_carry: {
1565 Temp src0 = get_alu_src(ctx, instr->src[0]);
1566 Temp src1 = get_alu_src(ctx, instr->src[1]);
1567 if (dst.regClass() == s1) {
1568 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1569 break;
1570 }
1571 if (dst.regClass() == v1) {
1572 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1573 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1574 break;
1575 }
1576
1577 Temp src00 = bld.tmp(src0.type(), 1);
1578 Temp src01 = bld.tmp(dst.type(), 1);
1579 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1580 Temp src10 = bld.tmp(src1.type(), 1);
1581 Temp src11 = bld.tmp(dst.type(), 1);
1582 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1583 if (dst.regClass() == s2) {
1584 Temp carry = bld.tmp(s1);
1585 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1586 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1587 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1588 } else if (dst.regClass() == v2) {
1589 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1590 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1591 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1592 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1593 } else {
1594 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1595 nir_print_instr(&instr->instr, stderr);
1596 fprintf(stderr, "\n");
1597 }
1598 break;
1599 }
1600 case nir_op_isub: {
1601 if (dst.regClass() == s1) {
1602 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1603 break;
1604 }
1605
1606 Temp src0 = get_alu_src(ctx, instr->src[0]);
1607 Temp src1 = get_alu_src(ctx, instr->src[1]);
1608 if (dst.regClass() == v1) {
1609 bld.vsub32(Definition(dst), src0, src1);
1610 break;
1611 }
1612
1613 Temp src00 = bld.tmp(src0.type(), 1);
1614 Temp src01 = bld.tmp(dst.type(), 1);
1615 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1616 Temp src10 = bld.tmp(src1.type(), 1);
1617 Temp src11 = bld.tmp(dst.type(), 1);
1618 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1619 if (dst.regClass() == s2) {
1620 Temp carry = bld.tmp(s1);
1621 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1622 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1623 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1624 } else if (dst.regClass() == v2) {
1625 Temp lower = bld.tmp(v1);
1626 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1627 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1628 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1629 } else {
1630 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1631 nir_print_instr(&instr->instr, stderr);
1632 fprintf(stderr, "\n");
1633 }
1634 break;
1635 }
1636 case nir_op_usub_borrow: {
1637 Temp src0 = get_alu_src(ctx, instr->src[0]);
1638 Temp src1 = get_alu_src(ctx, instr->src[1]);
1639 if (dst.regClass() == s1) {
1640 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1641 break;
1642 } else if (dst.regClass() == v1) {
1643 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1644 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1645 break;
1646 }
1647
1648 Temp src00 = bld.tmp(src0.type(), 1);
1649 Temp src01 = bld.tmp(dst.type(), 1);
1650 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1651 Temp src10 = bld.tmp(src1.type(), 1);
1652 Temp src11 = bld.tmp(dst.type(), 1);
1653 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1654 if (dst.regClass() == s2) {
1655 Temp borrow = bld.tmp(s1);
1656 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1657 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1658 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1659 } else if (dst.regClass() == v2) {
1660 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1661 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1662 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1663 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1664 } else {
1665 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1666 nir_print_instr(&instr->instr, stderr);
1667 fprintf(stderr, "\n");
1668 }
1669 break;
1670 }
1671 case nir_op_imul: {
1672 if (dst.regClass() == v1) {
1673 bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1674 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1675 } else if (dst.regClass() == s1) {
1676 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1677 } else {
1678 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1679 nir_print_instr(&instr->instr, stderr);
1680 fprintf(stderr, "\n");
1681 }
1682 break;
1683 }
1684 case nir_op_umul_high: {
1685 if (dst.regClass() == v1) {
1686 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1687 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1688 bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1689 } else if (dst.regClass() == s1) {
1690 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1691 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1692 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1693 } else {
1694 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1695 nir_print_instr(&instr->instr, stderr);
1696 fprintf(stderr, "\n");
1697 }
1698 break;
1699 }
1700 case nir_op_imul_high: {
1701 if (dst.regClass() == v1) {
1702 bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1703 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1704 bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1705 } else if (dst.regClass() == s1) {
1706 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1707 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1708 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1709 } else {
1710 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1711 nir_print_instr(&instr->instr, stderr);
1712 fprintf(stderr, "\n");
1713 }
1714 break;
1715 }
1716 case nir_op_fmul: {
1717 Temp src0 = get_alu_src(ctx, instr->src[0]);
1718 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1719 if (dst.regClass() == v2b) {
1720 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
1721 } else if (dst.regClass() == v1) {
1722 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1723 } else if (dst.regClass() == v2) {
1724 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1);
1725 } else {
1726 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1727 nir_print_instr(&instr->instr, stderr);
1728 fprintf(stderr, "\n");
1729 }
1730 break;
1731 }
1732 case nir_op_fadd: {
1733 Temp src0 = get_alu_src(ctx, instr->src[0]);
1734 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1735 if (dst.regClass() == v2b) {
1736 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
1737 } else if (dst.regClass() == v1) {
1738 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1739 } else if (dst.regClass() == v2) {
1740 bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1);
1741 } else {
1742 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1743 nir_print_instr(&instr->instr, stderr);
1744 fprintf(stderr, "\n");
1745 }
1746 break;
1747 }
1748 case nir_op_fsub: {
1749 Temp src0 = get_alu_src(ctx, instr->src[0]);
1750 Temp src1 = get_alu_src(ctx, instr->src[1]);
1751 if (dst.regClass() == v2b) {
1752 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1753 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
1754 else
1755 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
1756 } else if (dst.regClass() == v1) {
1757 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1758 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1759 else
1760 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1761 } else if (dst.regClass() == v2) {
1762 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1763 as_vgpr(ctx, src0), as_vgpr(ctx, src1));
1764 VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1765 sub->neg[1] = true;
1766 } else {
1767 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1768 nir_print_instr(&instr->instr, stderr);
1769 fprintf(stderr, "\n");
1770 }
1771 break;
1772 }
1773 case nir_op_fmax: {
1774 Temp src0 = get_alu_src(ctx, instr->src[0]);
1775 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1776 if (dst.regClass() == v2b) {
1777 // TODO: check fp_mode.must_flush_denorms16_64
1778 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
1779 } else if (dst.regClass() == v1) {
1780 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1781 } else if (dst.regClass() == v2) {
1782 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1783 Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2), src0, src1);
1784 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1785 } else {
1786 bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1);
1787 }
1788 } else {
1789 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1790 nir_print_instr(&instr->instr, stderr);
1791 fprintf(stderr, "\n");
1792 }
1793 break;
1794 }
1795 case nir_op_fmin: {
1796 Temp src0 = get_alu_src(ctx, instr->src[0]);
1797 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1798 if (dst.regClass() == v2b) {
1799 // TODO: check fp_mode.must_flush_denorms16_64
1800 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
1801 } else if (dst.regClass() == v1) {
1802 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1803 } else if (dst.regClass() == v2) {
1804 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1805 Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), src0, src1);
1806 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1807 } else {
1808 bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1);
1809 }
1810 } else {
1811 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1812 nir_print_instr(&instr->instr, stderr);
1813 fprintf(stderr, "\n");
1814 }
1815 break;
1816 }
1817 case nir_op_fmax3: {
1818 if (dst.regClass() == v2b) {
1819 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, dst, false);
1820 } else if (dst.regClass() == v1) {
1821 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1822 } else {
1823 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1824 nir_print_instr(&instr->instr, stderr);
1825 fprintf(stderr, "\n");
1826 }
1827 break;
1828 }
1829 case nir_op_fmin3: {
1830 if (dst.regClass() == v2b) {
1831 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, dst, false);
1832 } else if (dst.regClass() == v1) {
1833 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1834 } else {
1835 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1836 nir_print_instr(&instr->instr, stderr);
1837 fprintf(stderr, "\n");
1838 }
1839 break;
1840 }
1841 case nir_op_fmed3: {
1842 if (dst.regClass() == v2b) {
1843 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, dst, false);
1844 } else if (dst.regClass() == v1) {
1845 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1846 } else {
1847 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1848 nir_print_instr(&instr->instr, stderr);
1849 fprintf(stderr, "\n");
1850 }
1851 break;
1852 }
1853 case nir_op_umax3: {
1854 if (dst.size() == 1) {
1855 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1856 } else {
1857 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1858 nir_print_instr(&instr->instr, stderr);
1859 fprintf(stderr, "\n");
1860 }
1861 break;
1862 }
1863 case nir_op_umin3: {
1864 if (dst.size() == 1) {
1865 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1866 } else {
1867 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1868 nir_print_instr(&instr->instr, stderr);
1869 fprintf(stderr, "\n");
1870 }
1871 break;
1872 }
1873 case nir_op_umed3: {
1874 if (dst.size() == 1) {
1875 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1876 } else {
1877 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1878 nir_print_instr(&instr->instr, stderr);
1879 fprintf(stderr, "\n");
1880 }
1881 break;
1882 }
1883 case nir_op_imax3: {
1884 if (dst.size() == 1) {
1885 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1886 } else {
1887 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1888 nir_print_instr(&instr->instr, stderr);
1889 fprintf(stderr, "\n");
1890 }
1891 break;
1892 }
1893 case nir_op_imin3: {
1894 if (dst.size() == 1) {
1895 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1896 } else {
1897 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1898 nir_print_instr(&instr->instr, stderr);
1899 fprintf(stderr, "\n");
1900 }
1901 break;
1902 }
1903 case nir_op_imed3: {
1904 if (dst.size() == 1) {
1905 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1906 } else {
1907 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1908 nir_print_instr(&instr->instr, stderr);
1909 fprintf(stderr, "\n");
1910 }
1911 break;
1912 }
1913 case nir_op_cube_face_coord: {
1914 Temp in = get_alu_src(ctx, instr->src[0], 3);
1915 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1916 emit_extract_vector(ctx, in, 1, v1),
1917 emit_extract_vector(ctx, in, 2, v1) };
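/* v_cubema returns twice the major axis; sc/tc are scaled by its reciprocal and
 * biased by 0.5 (the v_madak constant) to yield the face coordinates. */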
1918 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1919 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1920 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1921 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1922 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1923 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1924 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1925 break;
1926 }
1927 case nir_op_cube_face_index: {
1928 Temp in = get_alu_src(ctx, instr->src[0], 3);
1929 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1930 emit_extract_vector(ctx, in, 1, v1),
1931 emit_extract_vector(ctx, in, 2, v1) };
1932 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1933 break;
1934 }
1935 case nir_op_bcsel: {
1936 emit_bcsel(ctx, instr, dst);
1937 break;
1938 }
1939 case nir_op_frsq: {
1940 Temp src = get_alu_src(ctx, instr->src[0]);
1941 if (dst.regClass() == v2b) {
1942 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
1943 } else if (dst.regClass() == v1) {
1944 emit_rsq(ctx, bld, Definition(dst), src);
1945 } else if (dst.regClass() == v2) {
1946 /* Lowered at NIR level for precision reasons. */
1947 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1948 } else {
1949 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1950 nir_print_instr(&instr->instr, stderr);
1951 fprintf(stderr, "\n");
1952 }
1953 break;
1954 }
1955 case nir_op_fneg: {
1956 Temp src = get_alu_src(ctx, instr->src[0]);
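/* If the float mode requires flushing denormals, multiply by 1.0 first so the
 * raw sign-bit XOR below doesn't preserve a denormal source. */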
1957 if (dst.regClass() == v2b) {
1958 if (ctx->block->fp_mode.must_flush_denorms16_64)
1959 src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
1960 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
1961 } else if (dst.regClass() == v1) {
1962 if (ctx->block->fp_mode.must_flush_denorms32)
1963 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1964 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1965 } else if (dst.regClass() == v2) {
1966 if (ctx->block->fp_mode.must_flush_denorms16_64)
1967 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1968 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1969 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1970 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1971 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1972 } else {
1973 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1974 nir_print_instr(&instr->instr, stderr);
1975 fprintf(stderr, "\n");
1976 }
1977 break;
1978 }
1979 case nir_op_fabs: {
1980 Temp src = get_alu_src(ctx, instr->src[0]);
1981 if (dst.regClass() == v2b) {
1982 if (ctx->block->fp_mode.must_flush_denorms16_64)
1983 src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
1984 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
1985 } else if (dst.regClass() == v1) {
1986 if (ctx->block->fp_mode.must_flush_denorms32)
1987 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1988 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1989 } else if (dst.regClass() == v2) {
1990 if (ctx->block->fp_mode.must_flush_denorms16_64)
1991 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1992 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1993 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1994 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1995 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1996 } else {
1997 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1998 nir_print_instr(&instr->instr, stderr);
1999 fprintf(stderr, "\n");
2000 }
2001 break;
2002 }
2003 case nir_op_fsat: {
2004 Temp src = get_alu_src(ctx, instr->src[0]);
2005 if (dst.regClass() == v2b) {
2006 bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src);
2007 } else if (dst.regClass() == v1) {
2008 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
2009 /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
2010 // TODO: confirm that this holds under all circumstances
2011 } else if (dst.regClass() == v2) {
2012 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
2013 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
2014 vop3->clamp = true;
2015 } else {
2016 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2017 nir_print_instr(&instr->instr, stderr);
2018 fprintf(stderr, "\n");
2019 }
2020 break;
2021 }
2022 case nir_op_flog2: {
2023 Temp src = get_alu_src(ctx, instr->src[0]);
2024 if (dst.regClass() == v2b) {
2025 emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2026 } else if (dst.regClass() == v1) {
2027 emit_log2(ctx, bld, Definition(dst), src);
2028 } else {
2029 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2030 nir_print_instr(&instr->instr, stderr);
2031 fprintf(stderr, "\n");
2032 }
2033 break;
2034 }
2035 case nir_op_frcp: {
2036 Temp src = get_alu_src(ctx, instr->src[0]);
2037 if (dst.regClass() == v2b) {
2038 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2039 } else if (dst.regClass() == v1) {
2040 emit_rcp(ctx, bld, Definition(dst), src);
2041 } else if (dst.regClass() == v2) {
2042 /* Lowered at NIR level for precision reasons. */
2043 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2044 } else {
2045 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2046 nir_print_instr(&instr->instr, stderr);
2047 fprintf(stderr, "\n");
2048 }
2049 break;
2050 }
2051 case nir_op_fexp2: {
2052 if (dst.regClass() == v2b) {
2053 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2054 } else if (dst.regClass() == v1) {
2055 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2056 } else {
2057 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2058 nir_print_instr(&instr->instr, stderr);
2059 fprintf(stderr, "\n");
2060 }
2061 break;
2062 }
2063 case nir_op_fsqrt: {
2064 Temp src = get_alu_src(ctx, instr->src[0]);
2065 if (dst.regClass() == v2b) {
2066 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2067 } else if (dst.regClass() == v1) {
2068 emit_sqrt(ctx, bld, Definition(dst), src);
2069 } else if (dst.regClass() == v2) {
2070 /* Lowered at NIR level for precision reasons. */
2071 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2072 } else {
2073 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2074 nir_print_instr(&instr->instr, stderr);
2075 fprintf(stderr, "\n");
2076 }
2077 break;
2078 }
2079 case nir_op_ffract: {
2080 if (dst.regClass() == v2b) {
2081 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2082 } else if (dst.regClass() == v1) {
2083 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2084 } else if (dst.regClass() == v2) {
2085 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2086 } else {
2087 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2088 nir_print_instr(&instr->instr, stderr);
2089 fprintf(stderr, "\n");
2090 }
2091 break;
2092 }
2093 case nir_op_ffloor: {
2094 Temp src = get_alu_src(ctx, instr->src[0]);
2095 if (dst.regClass() == v2b) {
2096 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2097 } else if (dst.regClass() == v1) {
2098 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2099 } else if (dst.regClass() == v2) {
2100 emit_floor_f64(ctx, bld, Definition(dst), src);
2101 } else {
2102 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2103 nir_print_instr(&instr->instr, stderr);
2104 fprintf(stderr, "\n");
2105 }
2106 break;
2107 }
2108 case nir_op_fceil: {
2109 Temp src0 = get_alu_src(ctx, instr->src[0]);
2110 if (dst.regClass() == v2b) {
2111 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2112 } else if (dst.regClass() == v1) {
2113 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2114 } else if (dst.regClass() == v2) {
2115 if (ctx->options->chip_class >= GFX7) {
2116 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2117 } else {
2118 /* GFX6 doesn't support V_CEIL_F64, lower it. */
2119 /* trunc = trunc(src0)
2120 * if (src0 > 0.0 && src0 != trunc)
2121 * trunc += 1.0
2122 */
2123 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2124 Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u));
2125 Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
2126 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1);
2127 Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond);
2128 add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add);
2129 bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
2130 }
2131 } else {
2132 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2133 nir_print_instr(&instr->instr, stderr);
2134 fprintf(stderr, "\n");
2135 }
2136 break;
2137 }
2138 case nir_op_ftrunc: {
2139 Temp src = get_alu_src(ctx, instr->src[0]);
2140 if (dst.regClass() == v2b) {
2141 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2142 } else if (dst.regClass() == v1) {
2143 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2144 } else if (dst.regClass() == v2) {
2145 emit_trunc_f64(ctx, bld, Definition(dst), src);
2146 } else {
2147 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2148 nir_print_instr(&instr->instr, stderr);
2149 fprintf(stderr, "\n");
2150 }
2151 break;
2152 }
2153 case nir_op_fround_even: {
2154 Temp src0 = get_alu_src(ctx, instr->src[0]);
2155 if (dst.regClass() == v2b) {
2156 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2157 } else if (dst.regClass() == v1) {
2158 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2159 } else if (dst.regClass() == v2) {
2160 if (ctx->options->chip_class >= GFX7) {
2161 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2162 } else {
2163 /* GFX6 doesn't support V_RNDNE_F64, lower it. */
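/* Adding and then subtracting +/-2^52 (exponent 0x433 in the high dword, the
 * source's sign copied in via v_bfi) rounds the fraction to nearest-even.
 * Sources with |src| too large to have fraction bits are passed through
 * unchanged via the compare below. */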
2164 Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2165 bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2166
2167 Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u)));
2168 Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi));
2169 Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
2170 Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
2171 static_cast<VOP3A_instruction*>(sub)->neg[1] = true;
2172 tmp = sub->definitions[0].getTemp();
2173
2174 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu));
2175 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
2176 static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
2177 Temp cond = vop3->definitions[0].getTemp();
2178
2179 Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2180 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2181 Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond);
2182 Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond);
2183
2184 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2185 }
2186 } else {
2187 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2188 nir_print_instr(&instr->instr, stderr);
2189 fprintf(stderr, "\n");
2190 }
2191 break;
2192 }
2193 case nir_op_fsin:
2194 case nir_op_fcos: {
2195 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2196 aco_ptr<Instruction> norm;
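/* v_sin/v_cos expect the angle pre-scaled by 1/(2*PI); despite the variable
 * name, 0x3118 (f16) and 0x3e22f983 (f32) below are that constant. */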
2197 if (dst.regClass() == v2b) {
2198 Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u));
2199 Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
2200 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2201 bld.vop1(opcode, Definition(dst), tmp);
2202 } else if (dst.regClass() == v1) {
2203 Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
2204 Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
2205
2206 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2207 if (ctx->options->chip_class < GFX9)
2208 tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
2209
2210 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2211 bld.vop1(opcode, Definition(dst), tmp);
2212 } else {
2213 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2214 nir_print_instr(&instr->instr, stderr);
2215 fprintf(stderr, "\n");
2216 }
2217 break;
2218 }
2219 case nir_op_ldexp: {
2220 Temp src0 = get_alu_src(ctx, instr->src[0]);
2221 Temp src1 = get_alu_src(ctx, instr->src[1]);
2222 if (dst.regClass() == v2b) {
2223 emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2224 } else if (dst.regClass() == v1) {
2225 bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1);
2226 } else if (dst.regClass() == v2) {
2227 bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), as_vgpr(ctx, src0), src1);
2228 } else {
2229 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2230 nir_print_instr(&instr->instr, stderr);
2231 fprintf(stderr, "\n");
2232 }
2233 break;
2234 }
2235 case nir_op_frexp_sig: {
2236 Temp src = get_alu_src(ctx, instr->src[0]);
2237 if (dst.regClass() == v2b) {
2238 bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src);
2239 } else if (dst.regClass() == v1) {
2240 bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src);
2241 } else if (dst.regClass() == v2) {
2242 bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src);
2243 } else {
2244 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2245 nir_print_instr(&instr->instr, stderr);
2246 fprintf(stderr, "\n");
2247 }
2248 break;
2249 }
2250 case nir_op_frexp_exp: {
2251 Temp src = get_alu_src(ctx, instr->src[0]);
2252 if (instr->src[0].src.ssa->bit_size == 16) {
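/* The f16 exponent fits in a signed byte: extract the low byte of the result
 * and sign-extend it to 32 bits. */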
2253 Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2254 tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand(0u));
2255 convert_int(ctx, bld, tmp, 8, 32, true, dst);
2256 } else if (instr->src[0].src.ssa->bit_size == 32) {
2257 bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src);
2258 } else if (instr->src[0].src.ssa->bit_size == 64) {
2259 bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src);
2260 } else {
2261 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2262 nir_print_instr(&instr->instr, stderr);
2263 fprintf(stderr, "\n");
2264 }
2265 break;
2266 }
2267 case nir_op_fsign: {
2268 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2269 if (dst.regClass() == v2b) {
2270 Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
2271 Temp minus_one = bld.copy(bld.def(v1), Operand(0xbc00u));
2272 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2273 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond);
2274 cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2275 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, cond);
2276 } else if (dst.regClass() == v1) {
2277 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2278 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
2279 cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2280 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
2281 } else if (dst.regClass() == v2) {
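/* +/-1.0 and +/-0.0 all have a zero low dword, so only the high dword is
 * selected; the low dword of the result is hard-wired to zero. */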
2282 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2283 Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
2284 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
2285
2286 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2287 tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
2288 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2289
2290 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2291 } else {
2292 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2293 nir_print_instr(&instr->instr, stderr);
2294 fprintf(stderr, "\n");
2295 }
2296 break;
2297 }
2298 case nir_op_f2f16:
2299 case nir_op_f2f16_rtne: {
2300 Temp src = get_alu_src(ctx, instr->src[0]);
2301 if (instr->src[0].src.ssa->bit_size == 64)
2302 src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2303 if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2304 /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2305 * keep value numbering and the scheduler simpler.
2306 */
2307 bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
2308 else
2309 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2310 break;
2311 }
2312 case nir_op_f2f16_rtz: {
2313 Temp src = get_alu_src(ctx, instr->src[0]);
2314 if (instr->src[0].src.ssa->bit_size == 64)
2315 src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2316 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u));
2317 break;
2318 }
2319 case nir_op_f2f32: {
2320 if (instr->src[0].src.ssa->bit_size == 16) {
2321 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2322 } else if (instr->src[0].src.ssa->bit_size == 64) {
2323 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2324 } else {
2325 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2326 nir_print_instr(&instr->instr, stderr);
2327 fprintf(stderr, "\n");
2328 }
2329 break;
2330 }
2331 case nir_op_f2f64: {
2332 Temp src = get_alu_src(ctx, instr->src[0]);
2333 if (instr->src[0].src.ssa->bit_size == 16)
2334 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2335 bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2336 break;
2337 }
2338 case nir_op_i2f16: {
2339 assert(dst.regClass() == v2b);
2340 Temp src = get_alu_src(ctx, instr->src[0]);
2341 if (instr->src[0].src.ssa->bit_size == 8)
2342 src = convert_int(ctx, bld, src, 8, 16, true);
2343 else if (instr->src[0].src.ssa->bit_size == 64)
2344 src = convert_int(ctx, bld, src, 64, 32, false);
2345 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2346 break;
2347 }
2348 case nir_op_i2f32: {
2349 assert(dst.size() == 1);
2350 Temp src = get_alu_src(ctx, instr->src[0]);
2351 if (instr->src[0].src.ssa->bit_size <= 16)
2352 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2353 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2354 break;
2355 }
2356 case nir_op_i2f64: {
2357 if (instr->src[0].src.ssa->bit_size <= 32) {
2358 Temp src = get_alu_src(ctx, instr->src[0]);
2359 if (instr->src[0].src.ssa->bit_size <= 16)
2360 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2361 bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
2362 } else if (instr->src[0].src.ssa->bit_size == 64) {
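/* Convert the two halves separately (upper signed, lower unsigned), scale the
 * upper result by 2^32 with v_ldexp and add. */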
2363 Temp src = get_alu_src(ctx, instr->src[0]);
2364 RegClass rc = RegClass(src.type(), 1);
2365 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2366 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2367 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2368 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2369 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
2370 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2371
2372 } else {
2373 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2374 nir_print_instr(&instr->instr, stderr);
2375 fprintf(stderr, "\n");
2376 }
2377 break;
2378 }
2379 case nir_op_u2f16: {
2380 assert(dst.regClass() == v2b);
2381 Temp src = get_alu_src(ctx, instr->src[0]);
2382 if (instr->src[0].src.ssa->bit_size == 8)
2383 src = convert_int(ctx, bld, src, 8, 16, false);
2384 else if (instr->src[0].src.ssa->bit_size == 64)
2385 src = convert_int(ctx, bld, src, 64, 32, false);
2386 bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
2387 break;
2388 }
2389 case nir_op_u2f32: {
2390 assert(dst.size() == 1);
2391 Temp src = get_alu_src(ctx, instr->src[0]);
2392 if (instr->src[0].src.ssa->bit_size == 8) {
2393 bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
2394 } else {
2395 if (instr->src[0].src.ssa->bit_size == 16)
2396 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2397 bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
2398 }
2399 break;
2400 }
2401 case nir_op_u2f64: {
2402 if (instr->src[0].src.ssa->bit_size <= 32) {
2403 Temp src = get_alu_src(ctx, instr->src[0]);
2404 if (instr->src[0].src.ssa->bit_size <= 16)
2405 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2406 bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
2407 } else if (instr->src[0].src.ssa->bit_size == 64) {
2408 Temp src = get_alu_src(ctx, instr->src[0]);
2409 RegClass rc = RegClass(src.type(), 1);
2410 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2411 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2412 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2413 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2414 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
2415 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2416 } else {
2417 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2418 nir_print_instr(&instr->instr, stderr);
2419 fprintf(stderr, "\n");
2420 }
2421 break;
2422 }
2423 case nir_op_f2i8:
2424 case nir_op_f2i16: {
2425 if (instr->src[0].src.ssa->bit_size == 16)
2426 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
2427 else if (instr->src[0].src.ssa->bit_size == 32)
2428 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2429 else
2430 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2431 break;
2432 }
2433 case nir_op_f2u8:
2434 case nir_op_f2u16: {
2435 if (instr->src[0].src.ssa->bit_size == 16)
2436 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
2437 else if (instr->src[0].src.ssa->bit_size == 32)
2438 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2439 else
2440 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2441 break;
2442 }
2443 case nir_op_f2i32: {
2444 Temp src = get_alu_src(ctx, instr->src[0]);
2445 if (instr->src[0].src.ssa->bit_size == 16) {
2446 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2447 if (dst.type() == RegType::vgpr) {
2448 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
2449 } else {
2450 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2451 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
2452 }
2453 } else if (instr->src[0].src.ssa->bit_size == 32) {
2454 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2455 } else if (instr->src[0].src.ssa->bit_size == 64) {
2456 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2457 } else {
2458 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2459 nir_print_instr(&instr->instr, stderr);
2460 fprintf(stderr, "\n");
2461 }
2462 break;
2463 }
2464 case nir_op_f2u32: {
2465 Temp src = get_alu_src(ctx, instr->src[0]);
2466 if (instr->src[0].src.ssa->bit_size == 16) {
2467 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2468 if (dst.type() == RegType::vgpr) {
2469 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
2470 } else {
2471 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2472 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
2473 }
2474 } else if (instr->src[0].src.ssa->bit_size == 32) {
2475 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2476 } else if (instr->src[0].src.ssa->bit_size == 64) {
2477 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2478 } else {
2479 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2480 nir_print_instr(&instr->instr, stderr);
2481 fprintf(stderr, "\n");
2482 }
2483 break;
2484 }
2485 case nir_op_f2i64: {
2486 Temp src = get_alu_src(ctx, instr->src[0]);
2487 if (instr->src[0].src.ssa->bit_size == 16)
2488 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2489
2490 if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
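/* There is no direct f32->i64 conversion, so do it manually: clamp the frexp
 * exponent to [0, 64], place the 24-bit mantissa (implicit leading one
 * included) near the top of a 64-bit value, shift it right by 63 - exponent,
 * and apply the sign with XOR + subtract (two's complement negation). */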
2491 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2492 exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
2493 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
2494 Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
2495 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
2496 mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
2497 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
2498 Temp new_exponent = bld.tmp(v1);
2499 Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
2500 if (ctx->program->chip_class >= GFX8)
2501 mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
2502 else
2503 mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
2504 Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
2505 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2506 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2507 lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
2508 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
2509 lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
2510 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
2511 Temp new_lower = bld.tmp(v1);
2512 borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
2513 Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
2514 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
2515
2516 } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
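/* Same algorithm on the scalar unit: the exponent is extracted with s_bfe
 * instead of v_frexp, and an exponent of 64 selects the saturated value. */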
2517 if (src.type() == RegType::vgpr)
2518 src = bld.as_uniform(src);
2519 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
2520 exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
2521 exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
2522 exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
2523 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
2524 Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
2525 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
2526 mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
2527 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
2528 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
2529 mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
2530 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
2531 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
2532 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
2533 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2534 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2535 lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
2536 upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
2537 Temp borrow = bld.tmp(s1);
2538 lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
2539 upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
2540 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2541
2542 } else if (instr->src[0].src.ssa->bit_size == 64) {
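/* Split the double into dwords: trunc(src) * 2^-32 (high dword 0x3df00000),
 * floored, gives the upper half; the fma with -2^32 (0xc1f00000) recovers the
 * remainder for the lower half. */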
2543 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
2544 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2545 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2546 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
2547 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2548 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2549 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2550 Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
2551 if (dst.type() == RegType::sgpr) {
2552 lower = bld.as_uniform(lower);
2553 upper = bld.as_uniform(upper);
2554 }
2555 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2556
2557 } else {
2558 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2559 nir_print_instr(&instr->instr, stderr);
2560 fprintf(stderr, "\n");
2561 }
2562 break;
2563 }
2564 case nir_op_f2u64: {
2565 Temp src = get_alu_src(ctx, instr->src[0]);
2566 if (instr->src[0].src.ssa->bit_size == 16)
2567 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2568
2569 if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
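/* Unsigned variant: for exponents below 24 the mantissa is shifted right into
 * the low dword, otherwise the 64-bit mantissa is shifted left; exponents that
 * don't fit in 64 bits produce the saturated result ~0. */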
2570 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2571 Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent);
2572 exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
2573 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
2574 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
2575 Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
2576 Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
2577 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
2578 Temp new_exponent = bld.tmp(v1);
2579 Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
2580 if (ctx->program->chip_class >= GFX8)
2581 mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
2582 else
2583 mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
2584 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2585 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2586 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
2587 upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
2588 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
2589 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
2590 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2591
2592 } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2593 if (src.type() == RegType::vgpr)
2594 src = bld.as_uniform(src);
2595 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
2596 exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
2597 exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
2598 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
2599 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
2600 Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
2601 Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
2602 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
2603 Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
2604 mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
2605 Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
2606 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
2607 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2608 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2609 Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
2610 lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
2611 upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
2612 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2613
2614 } else if (instr->src[0].src.ssa->bit_size == 64) {
2615 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
2616 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2617 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2618 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
2619 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2620 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2621 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2622 Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
2623 if (dst.type() == RegType::sgpr) {
2624 lower = bld.as_uniform(lower);
2625 upper = bld.as_uniform(upper);
2626 }
2627 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2628
2629 } else {
2630 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2631 nir_print_instr(&instr->instr, stderr);
2632 fprintf(stderr, "\n");
2633 }
2634 break;
2635 }
2636 case nir_op_b2f16: {
2637 Temp src = get_alu_src(ctx, instr->src[0]);
2638 assert(src.regClass() == bld.lm);
2639
2640 if (dst.regClass() == s1) {
2641 src = bool_to_scalar_condition(ctx, src);
2642 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3c00u), src);
2643 } else if (dst.regClass() == v2b) {
2644 Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
2645 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), one, src);
2646 } else {
2647 unreachable("Wrong destination register class for nir_op_b2f16.");
2648 }
2649 break;
2650 }
2651 case nir_op_b2f32: {
2652 Temp src = get_alu_src(ctx, instr->src[0]);
2653 assert(src.regClass() == bld.lm);
2654
2655 if (dst.regClass() == s1) {
2656 src = bool_to_scalar_condition(ctx, src);
2657 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
2658 } else if (dst.regClass() == v1) {
2659 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
2660 } else {
2661 unreachable("Wrong destination register class for nir_op_b2f32.");
2662 }
2663 break;
2664 }
2665 case nir_op_b2f64: {
2666 Temp src = get_alu_src(ctx, instr->src[0]);
2667 assert(src.regClass() == bld.lm);
2668
2669 if (dst.regClass() == s2) {
2670 src = bool_to_scalar_condition(ctx, src);
2671 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
2672 } else if (dst.regClass() == v2) {
2673 Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
2674 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
2675 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2676 } else {
2677 unreachable("Wrong destination register class for nir_op_b2f64.");
2678 }
2679 break;
2680 }
2681 case nir_op_i2i8:
2682 case nir_op_i2i16:
2683 case nir_op_i2i32:
2684 case nir_op_i2i64: {
2685 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
2686 instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, true, dst);
2687 break;
2688 }
2689 case nir_op_u2u8:
2690 case nir_op_u2u16:
2691 case nir_op_u2u32:
2692 case nir_op_u2u64: {
2693 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
2694 instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst);
2695 break;
2696 }
2697 case nir_op_b2b32:
2698 case nir_op_b2i8:
2699 case nir_op_b2i16:
2700 case nir_op_b2i32:
2701 case nir_op_b2i64: {
2702 Temp src = get_alu_src(ctx, instr->src[0]);
2703 assert(src.regClass() == bld.lm);
2704
2705 Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;
2706 if (tmp.regClass() == s1) {
2707 // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
2708 bool_to_scalar_condition(ctx, src, tmp);
2709 } else if (tmp.type() == RegType::vgpr) {
2710 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand(0u), Operand(1u), src);
2711 } else {
2712 unreachable("Invalid register class for b2i32");
2713 }
2714
2715 if (tmp != dst)
2716 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
2717 break;
2718 }
2719 case nir_op_b2b1:
2720 case nir_op_i2b1: {
2721 Temp src = get_alu_src(ctx, instr->src[0]);
2722 assert(dst.regClass() == bld.lm);
2723
2724 if (src.type() == RegType::vgpr) {
2725 assert(src.regClass() == v1 || src.regClass() == v2);
2726 assert(dst.regClass() == bld.lm);
2727 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
2728 Definition(dst), Operand(0u), src).def(0).setHint(vcc);
2729 } else {
2730 assert(src.regClass() == s1 || src.regClass() == s2);
2731 Temp tmp;
2732 if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
2733 tmp = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand(0u), src).def(1).getTemp();
2734 } else {
2735 tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
2736 bld.scc(bld.def(s1)), Operand(0u), src);
2737 }
2738 bool_to_vector_condition(ctx, tmp, dst);
2739 }
2740 break;
2741 }
2742 case nir_op_pack_64_2x32_split: {
2743 Temp src0 = get_alu_src(ctx, instr->src[0]);
2744 Temp src1 = get_alu_src(ctx, instr->src[1]);
2745
2746 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2747 break;
2748 }
2749 case nir_op_unpack_64_2x32_split_x:
2750 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2751 break;
2752 case nir_op_unpack_64_2x32_split_y:
2753 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2754 break;
2755 case nir_op_unpack_32_2x16_split_x:
2756 if (dst.type() == RegType::vgpr) {
2757 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2758 } else {
2759 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
2760 }
2761 break;
2762 case nir_op_unpack_32_2x16_split_y:
2763 if (dst.type() == RegType::vgpr) {
2764 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2765 } else {
2766 bld.sop2(aco_opcode::s_bfe_u32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]), Operand(uint32_t(16 << 16 | 16)));
2767 }
2768 break;
2769 case nir_op_pack_32_2x16_split: {
2770 Temp src0 = get_alu_src(ctx, instr->src[0]);
2771 Temp src1 = get_alu_src(ctx, instr->src[1]);
2772 if (dst.regClass() == v1) {
2773 src0 = emit_extract_vector(ctx, src0, 0, v2b);
2774 src1 = emit_extract_vector(ctx, src1, 0, v2b);
2775 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2776 } else {
2777 src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, Operand(0xFFFFu));
2778 src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1, Operand(16u));
2779 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
2780 }
2781 break;
2782 }
2783 case nir_op_pack_half_2x16: {
2784 Temp src = get_alu_src(ctx, instr->src[0], 2);
2785
2786 if (dst.regClass() == v1) {
2787 Temp src0 = bld.tmp(v1);
2788 Temp src1 = bld.tmp(v1);
2789 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2790 if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz)
2791 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2792 else
2793 bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst),
2794 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src0),
2795 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src1));
2796 } else {
2797 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2798 nir_print_instr(&instr->instr, stderr);
2799 fprintf(stderr, "\n");
2800 }
2801 break;
2802 }
2803 case nir_op_unpack_half_2x16_split_x: {
2804 if (dst.regClass() == v1) {
2805 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2806 } else {
2807 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2808 nir_print_instr(&instr->instr, stderr);
2809 fprintf(stderr, "\n");
2810 }
2811 break;
2812 }
2813 case nir_op_unpack_half_2x16_split_y: {
2814 if (dst.regClass() == v1) {
2815 /* TODO: use SDWA here */
2816 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2817 bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2818 } else {
2819 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2820 nir_print_instr(&instr->instr, stderr);
2821 fprintf(stderr, "\n");
2822 }
2823 break;
2824 }
2825 case nir_op_fquantize2f16: {
2826 Temp src = get_alu_src(ctx, instr->src[0]);
2827 Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
2828 Temp f32, cmp_res;
2829
2830 if (ctx->program->chip_class >= GFX8) {
2831 Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */
2832 cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);
2833 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2834 } else {
2835 /* 0x38800000 is the smallest positive normal half-float value (2^-14) as a
2836 * 32-bit float, so compare against it and flush the result to 0 if smaller.
2837 */
2838 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2839 Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u));
2840 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), f32, smallest);
2841 static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
2842 cmp_res = vop3->definitions[0].getTemp();
2843 }
2844
2845 if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32 || ctx->program->chip_class < GFX8) {
2846 Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src));
2847 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
2848 } else {
2849 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2850 }
2851 break;
2852 }
2853 case nir_op_bfm: {
2854 Temp bits = get_alu_src(ctx, instr->src[0]);
2855 Temp offset = get_alu_src(ctx, instr->src[1]);
2856
2857 if (dst.regClass() == s1) {
2858 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2859 } else if (dst.regClass() == v1) {
2860 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2861 } else {
2862 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2863 nir_print_instr(&instr->instr, stderr);
2864 fprintf(stderr, "\n");
2865 }
2866 break;
2867 }
2868 case nir_op_bitfield_select: {
2869 /* (mask & insert) | (~mask & base) */
2870 Temp bitmask = get_alu_src(ctx, instr->src[0]);
2871 Temp insert = get_alu_src(ctx, instr->src[1]);
2872 Temp base = get_alu_src(ctx, instr->src[2]);
2873
2874 /* dst = (insert & bitmask) | (base & ~bitmask) */
2875 if (dst.regClass() == s1) {
2876 aco_ptr<Instruction> sop2;
2877 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2878 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2879 Operand lhs;
2880 if (const_insert && const_bitmask) {
2881 lhs = Operand(const_insert->u32 & const_bitmask->u32);
2882 } else {
2883 insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2884 lhs = Operand(insert);
2885 }
2886
2887 Operand rhs;
2888 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2889 if (const_base && const_bitmask) {
2890 rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2891 } else {
2892 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2893 rhs = Operand(base);
2894 }
2895
2896 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2897
2898 } else if (dst.regClass() == v1) {
2899 if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || insert.type() == RegType::sgpr))
2900 base = as_vgpr(ctx, base);
2901 if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2902 insert = as_vgpr(ctx, insert);
2903
2904 bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2905
2906 } else {
2907 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2908 nir_print_instr(&instr->instr, stderr);
2909 fprintf(stderr, "\n");
2910 }
2911 break;
2912 }
2913 case nir_op_ubfe:
2914 case nir_op_ibfe: {
2915 Temp base = get_alu_src(ctx, instr->src[0]);
2916 Temp offset = get_alu_src(ctx, instr->src[1]);
2917 Temp bits = get_alu_src(ctx, instr->src[2]);
2918
2919 if (dst.type() == RegType::sgpr) {
2920 Operand extract;
2921 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2922 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2923 if (const_offset && const_bits) {
2924 uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2925 extract = Operand(const_extract);
2926 } else {
2927 Operand width;
2928 if (const_bits) {
2929 width = Operand(const_bits->u32 << 16);
2930 } else {
2931 width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2932 }
2933 extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2934 }
2935
2936 aco_opcode opcode;
2937 if (dst.regClass() == s1) {
2938 if (instr->op == nir_op_ubfe)
2939 opcode = aco_opcode::s_bfe_u32;
2940 else
2941 opcode = aco_opcode::s_bfe_i32;
2942 } else if (dst.regClass() == s2) {
2943 if (instr->op == nir_op_ubfe)
2944 opcode = aco_opcode::s_bfe_u64;
2945 else
2946 opcode = aco_opcode::s_bfe_i64;
2947 } else {
2948 unreachable("Unsupported BFE bit size");
2949 }
2950
2951 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2952
2953 } else {
2954 aco_opcode opcode;
2955 if (dst.regClass() == v1) {
2956 if (instr->op == nir_op_ubfe)
2957 opcode = aco_opcode::v_bfe_u32;
2958 else
2959 opcode = aco_opcode::v_bfe_i32;
2960 } else {
2961 unreachable("Unsupported BFE bit size");
2962 }
2963
2964 emit_vop3a_instruction(ctx, instr, opcode, dst);
2965 }
2966 break;
2967 }
2968 case nir_op_bit_count: {
2969 Temp src = get_alu_src(ctx, instr->src[0]);
2970 if (src.regClass() == s1) {
2971 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2972 } else if (src.regClass() == v1) {
2973 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2974 } else if (src.regClass() == v2) {
2975 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2976 emit_extract_vector(ctx, src, 1, v1),
2977 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2978 emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2979 } else if (src.regClass() == s2) {
2980 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2981 } else {
2982 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2983 nir_print_instr(&instr->instr, stderr);
2984 fprintf(stderr, "\n");
2985 }
2986 break;
2987 }
2988 case nir_op_flt: {
2989 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64);
2990 break;
2991 }
2992 case nir_op_fge: {
2993 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64);
2994 break;
2995 }
2996 case nir_op_feq: {
2997 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64);
2998 break;
2999 }
3000 case nir_op_fne: {
3001 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64);
3002 break;
3003 }
3004 case nir_op_ilt: {
3005 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
3006 break;
3007 }
3008 case nir_op_ige: {
3009 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
3010 break;
3011 }
3012 case nir_op_ieq: {
3013 if (instr->src[0].src.ssa->bit_size == 1)
3014 emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3015 else
3016 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
3017 ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3018 break;
3019 }
3020 case nir_op_ine: {
3021 if (instr->src[0].src.ssa->bit_size == 1)
3022 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3023 else
3024 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
3025 ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3026 break;
3027 }
3028 case nir_op_ult: {
3029 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
3030 break;
3031 }
3032 case nir_op_uge: {
3033 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
3034 break;
3035 }
3036 case nir_op_fddx:
3037 case nir_op_fddy:
3038 case nir_op_fddx_fine:
3039 case nir_op_fddy_fine:
3040 case nir_op_fddx_coarse:
3041 case nir_op_fddy_coarse: {
3042 Temp src = get_alu_src(ctx, instr->src[0]);
3043 uint16_t dpp_ctrl1, dpp_ctrl2;
3044 if (instr->op == nir_op_fddx_fine) {
3045 dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
3046 dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
3047 } else if (instr->op == nir_op_fddy_fine) {
3048 dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
3049 dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
3050 } else {
3051 dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
3052 if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
3053 dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
3054 else
3055 dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
3056 }
3057
3058 Temp tmp;
3059 if (ctx->program->chip_class >= GFX8) {
3060 Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3061 tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
3062 } else {
3063 Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
3064 Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
3065 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
3066 }
3067 emit_wqm(ctx, tmp, dst, true);
3068 break;
3069 }
3070 default:
3071 fprintf(stderr, "Unknown NIR ALU instr: ");
3072 nir_print_instr(&instr->instr, stderr);
3073 fprintf(stderr, "\n");
3074 }
3075 }
3076
3077 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
3078 {
3079 Temp dst = get_ssa_temp(ctx, &instr->def);
3080
3081 // TODO: we really want to know the resulting type here, as that would allow 64-bit
3082 // literals, which otherwise get truncated (the LSBs for doubles, the MSBs for ints).
3083 // For now, we only use s_mov_b64 with 64-bit inline constants.
3084 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
3085 assert(dst.type() == RegType::sgpr);
3086
3087 Builder bld(ctx->program, ctx->block);
3088
3089 if (instr->def.bit_size == 1) {
3090 assert(dst.regClass() == bld.lm);
3091 int val = instr->value[0].b ? -1 : 0;
3092 Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val);
3093 bld.sop1(Builder::s_mov, Definition(dst), op);
3094 } else if (instr->def.bit_size == 8) {
3095 /* ensure that the value is correctly represented in the low byte of the register */
3096 bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u8);
3097 } else if (instr->def.bit_size == 16) {
3098 /* ensure that the value is correctly represented in the low half of the register */
3099 bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u16);
3100 } else if (dst.size() == 1) {
3101 bld.copy(Definition(dst), Operand(instr->value[0].u32));
3102 } else {
3103 assert(dst.size() != 1);
3104 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3105 if (instr->def.bit_size == 64)
3106 for (unsigned i = 0; i < dst.size(); i++)
3107 vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
3108 else {
3109 for (unsigned i = 0; i < dst.size(); i++)
3110 vec->operands[i] = Operand{instr->value[i].u32};
3111 }
3112 vec->definitions[0] = Definition(dst);
3113 ctx->block->instructions.emplace_back(std::move(vec));
3114 }
3115 }
3116
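/* Widen a per-element write mask to a finer granularity: each set bit i in `mask`
 * becomes `multiplier` consecutive set bits starting at bit i * multiplier,
 * e.g. widen_mask(0b101, 4) == 0x0f0f.
 */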
3117 uint32_t widen_mask(uint32_t mask, unsigned multiplier)
3118 {
3119 uint32_t new_mask = 0;
3120 for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
3121 if (mask & (1u << i))
3122 new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
3123 return new_mask;
3124 }
3125
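/* Describes a load for emit_load(): the destination temp, number of components and
 * their size, the addressing information (resource descriptor, variable offset,
 * constant offset, optional soffset) and alignment/caching/reordering behaviour.
 * emit_load() may split this into several hardware loads. */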
3126 struct LoadEmitInfo {
3127 Operand offset;
3128 Temp dst;
3129 unsigned num_components;
3130 unsigned component_size;
3131 Temp resource = Temp(0, s1);
3132 unsigned component_stride = 0;
3133 unsigned const_offset = 0;
3134 unsigned align_mul = 0;
3135 unsigned align_offset = 0;
3136
3137 bool glc = false;
3138 unsigned swizzle_component_size = 0;
3139 barrier_interaction barrier = barrier_none;
3140 bool can_reorder = true;
3141 Temp soffset = Temp(0, s1);
3142 };
3143
3144 using LoadCallback = Temp(*)(
3145 Builder& bld, const LoadEmitInfo* info, Temp offset, unsigned bytes_needed,
3146 unsigned align, unsigned const_offset, Temp dst_hint);
3147
3148 template <LoadCallback callback, bool byte_align_loads, bool supports_8bit_16bit_loads, unsigned max_const_offset_plus_one>
3149 void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
3150 {
3151 unsigned load_size = info->num_components * info->component_size;
3152 unsigned component_size = info->component_size;
3153
3154 unsigned num_vals = 0;
3155 Temp vals[info->dst.bytes()];
3156
3157 unsigned const_offset = info->const_offset;
3158
3159 unsigned align_mul = info->align_mul ? info->align_mul : component_size;
3160 unsigned align_offset = (info->align_offset + const_offset) % align_mul;
3161
3162 unsigned bytes_read = 0;
3163 while (bytes_read < load_size) {
3164 unsigned bytes_needed = load_size - bytes_read;
3165
3166 /* pad unaligned loads: byte_align is the known misalignment within a dword, or -1 if it is only known at run time */
3167 int byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
3168
3169 if (byte_align) {
3170 if ((bytes_needed > 2 ||
3171 (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
3172 !supports_8bit_16bit_loads) && byte_align_loads) {
3173 if (info->component_stride) {
3174 assert(supports_8bit_16bit_loads && "unimplemented");
3175 bytes_needed = 2;
3176 byte_align = 0;
3177 } else {
3178 bytes_needed += byte_align == -1 ? 4 - info->align_mul : byte_align;
3179 bytes_needed = align(bytes_needed, 4);
3180 }
3181 } else {
3182 byte_align = 0;
3183 }
3184 }
3185
3186 if (info->swizzle_component_size)
3187 bytes_needed = MIN2(bytes_needed, info->swizzle_component_size);
3188 if (info->component_stride)
3189 bytes_needed = MIN2(bytes_needed, info->component_size);
3190
3191 bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
3192
3193 /* reduce constant offset */
3194 Operand offset = info->offset;
3195 unsigned reduced_const_offset = const_offset;
3196 bool remove_const_offset_completely = need_to_align_offset;
3197 if (const_offset && (remove_const_offset_completely || const_offset >= max_const_offset_plus_one)) {
3198 unsigned to_add = const_offset;
3199 if (remove_const_offset_completely) {
3200 reduced_const_offset = 0;
3201 } else {
3202 to_add = const_offset / max_const_offset_plus_one * max_const_offset_plus_one;
3203 reduced_const_offset %= max_const_offset_plus_one;
3204 }
3205 Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3206 if (offset.isConstant()) {
3207 offset = Operand(offset.constantValue() + to_add);
3208 } else if (offset_tmp.regClass() == s1) {
3209 offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
3210 offset_tmp, Operand(to_add));
3211 } else if (offset_tmp.regClass() == v1) {
3212 offset = bld.vadd32(bld.def(v1), offset_tmp, Operand(to_add));
3213 } else {
3214 Temp lo = bld.tmp(offset_tmp.type(), 1);
3215 Temp hi = bld.tmp(offset_tmp.type(), 1);
3216 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3217
3218 if (offset_tmp.regClass() == s2) {
3219 Temp carry = bld.tmp(s1);
3220 lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo, Operand(to_add));
3221 hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
3222 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
3223 } else {
3224 Temp new_lo = bld.tmp(v1);
3225 Temp carry = bld.vadd32(Definition(new_lo), lo, Operand(to_add), true).def(1).getTemp();
3226 hi = bld.vadd32(bld.def(v1), hi, Operand(0u), false, carry);
3227 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
3228 }
3229 }
3230 }
3231
3232 /* align offset down if needed */
3233 Operand aligned_offset = offset;
3234 if (need_to_align_offset) {
3235 Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3236 if (offset.isConstant()) {
3237 aligned_offset = Operand(offset.constantValue() & 0xfffffffcu);
3238 } else if (offset_tmp.regClass() == s1) {
3239 aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfffffffcu), offset_tmp);
3240 } else if (offset_tmp.regClass() == s2) {
3241 aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand((uint64_t)0xfffffffffffffffcllu), offset_tmp);
3242 } else if (offset_tmp.regClass() == v1) {
3243 aligned_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), offset_tmp);
3244 } else if (offset_tmp.regClass() == v2) {
3245 Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
3246 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3247 lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), lo);
3248 aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
3249 }
3250 }
3251 Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() :
3252 bld.copy(bld.def(s1), aligned_offset);
3253
3254 unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
3255 Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align,
3256 reduced_const_offset, byte_align ? Temp() : info->dst);
3257
3258 /* the callback wrote directly to dst */
3259 if (val == info->dst) {
3260 assert(num_vals == 0);
3261 emit_split_vector(ctx, info->dst, info->num_components);
3262 return;
3263 }
3264
3265 /* shift result right if needed */
3266 if (info->component_size < 4 && byte_align_loads) {
3267 Operand align((uint32_t)byte_align);
3268 if (byte_align == -1) {
3269 if (offset.isConstant())
3270 align = Operand(offset.constantValue() % 4u);
3271 else if (offset.size() == 2)
3272 align = Operand(emit_extract_vector(ctx, offset.getTemp(), 0, RegClass(offset.getTemp().type(), 1)));
3273 else
3274 align = offset;
3275 }
3276
3277 assert(val.bytes() >= load_size && "unimplemented");
3278 if (val.type() == RegType::sgpr)
3279 byte_align_scalar(ctx, val, align, info->dst);
3280 else
3281 byte_align_vector(ctx, val, align, info->dst, component_size);
3282 return;
3283 }
3284
3285 /* add result to list and advance */
3286 if (info->component_stride) {
3287 assert(val.bytes() == info->component_size && "unimplemented");
3288 const_offset += info->component_stride;
3289 align_offset = (align_offset + info->component_stride) % align_mul;
3290 } else {
3291 const_offset += val.bytes();
3292 align_offset = (align_offset + val.bytes()) % align_mul;
3293 }
3294 bytes_read += val.bytes();
3295 vals[num_vals++] = val;
3296 }
3297
3298 /* create array of components */
3299 unsigned components_split = 0;
3300 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
3301 bool has_vgprs = false;
3302 for (unsigned i = 0; i < num_vals;) {
3303 Temp tmp[num_vals];
3304 unsigned num_tmps = 0;
3305 unsigned tmp_size = 0;
3306 RegType reg_type = RegType::sgpr;
3307 while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
3308 if (vals[i].type() == RegType::vgpr)
3309 reg_type = RegType::vgpr;
3310 tmp_size += vals[i].bytes();
3311 tmp[num_tmps++] = vals[i++];
3312 }
3313 if (num_tmps > 1) {
3314 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3315 aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
3316 for (unsigned i = 0; i < num_tmps; i++)
3317 vec->operands[i] = Operand(tmp[i]);
3318 tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
3319 vec->definitions[0] = Definition(tmp[0]);
3320 bld.insert(std::move(vec));
3321 }
3322
3323 if (tmp[0].bytes() % component_size) {
3324 /* trim tmp[0] */
3325 assert(i == num_vals);
3326 RegClass new_rc = RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
3327 tmp[0] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand(0u));
3328 }
3329
3330 RegClass elem_rc = RegClass::get(reg_type, component_size);
3331
3332 unsigned start = components_split;
3333
3334 if (tmp_size == elem_rc.bytes()) {
3335 allocated_vec[components_split++] = tmp[0];
3336 } else {
3337 assert(tmp_size % elem_rc.bytes() == 0);
3338 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
3339 aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
3340 for (unsigned i = 0; i < split->definitions.size(); i++) {
3341 Temp component = bld.tmp(elem_rc);
3342 allocated_vec[components_split++] = component;
3343 split->definitions[i] = Definition(component);
3344 }
3345 split->operands[0] = Operand(tmp[0]);
3346 bld.insert(std::move(split));
3347 }
3348
3349 /* try to p_as_uniform early so we can create more optimizable code and
3350 * also update allocated_vec */
3351 for (unsigned j = start; j < components_split; j++) {
3352 if (allocated_vec[j].bytes() % 4 == 0 && info->dst.type() == RegType::sgpr)
3353 allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
3354 has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
3355 }
3356 }
3357
3358 /* concatenate components and p_as_uniform() result if needed */
3359 if (info->dst.type() == RegType::vgpr || !has_vgprs)
3360 ctx->allocated_vec.emplace(info->dst.id(), allocated_vec);
3361
3362 int padding_bytes = MAX2((int)info->dst.bytes() - int(allocated_vec[0].bytes() * info->num_components), 0);
3363
3364 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3365 aco_opcode::p_create_vector, Format::PSEUDO, info->num_components + !!padding_bytes, 1)};
3366 for (unsigned i = 0; i < info->num_components; i++)
3367 vec->operands[i] = Operand(allocated_vec[i]);
3368 if (padding_bytes)
3369 vec->operands[info->num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
3370 if (info->dst.type() == RegType::sgpr && has_vgprs) {
3371 Temp tmp = bld.tmp(RegType::vgpr, info->dst.size());
3372 vec->definitions[0] = Definition(tmp);
3373 bld.insert(std::move(vec));
3374 bld.pseudo(aco_opcode::p_as_uniform, Definition(info->dst), tmp);
3375 } else {
3376 vec->definitions[0] = Definition(info->dst);
3377 bld.insert(std::move(vec));
3378 }
3379 }
3380
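/* On GFX6-8, m0 provides the DS address limit for LDS access; initialize it to the
 * maximum (0xffff) so LDS loads/stores are not clamped (see the TODO below for GFX9+). */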
3381 Operand load_lds_size_m0(Builder& bld)
3382 {
3383 /* TODO: m0 does not need to be initialized on GFX9+ */
3384 return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
3385 }
3386
3387 Temp lds_load_callback(Builder& bld, const LoadEmitInfo *info,
3388 Temp offset, unsigned bytes_needed,
3389 unsigned align, unsigned const_offset,
3390 Temp dst_hint)
3391 {
3392 offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
3393
3394 Operand m = load_lds_size_m0(bld);
3395
3396 bool large_ds_read = bld.program->chip_class >= GFX7;
3397 bool usable_read2 = bld.program->chip_class >= GFX7;
3398
3399 bool read2 = false;
3400 unsigned size = 0;
3401 aco_opcode op;
3402 //TODO: use ds_read_u8_d16_hi/ds_read_u16_d16_hi if beneficial
3403 if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
3404 size = 16;
3405 op = aco_opcode::ds_read_b128;
3406 } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
3407 size = 16;
3408 read2 = true;
3409 op = aco_opcode::ds_read2_b64;
3410 } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
3411 size = 12;
3412 op = aco_opcode::ds_read_b96;
3413 } else if (bytes_needed >= 8 && align % 8 == 0) {
3414 size = 8;
3415 op = aco_opcode::ds_read_b64;
3416 } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0) {
3417 size = 8;
3418 read2 = true;
3419 op = aco_opcode::ds_read2_b32;
3420 } else if (bytes_needed >= 4 && align % 4 == 0) {
3421 size = 4;
3422 op = aco_opcode::ds_read_b32;
3423 } else if (bytes_needed >= 2 && align % 2 == 0) {
3424 size = 2;
3425 op = aco_opcode::ds_read_u16;
3426 } else {
3427 size = 1;
3428 op = aco_opcode::ds_read_u8;
3429 }
3430
3431 unsigned max_offset_plus_one = read2 ? 254 * (size / 2u) + 1 : 65536;
3432 if (const_offset >= max_offset_plus_one) {
3433 offset = bld.vadd32(bld.def(v1), offset, Operand(const_offset / max_offset_plus_one));
3434 const_offset %= max_offset_plus_one;
3435 }
3436
3437 if (read2)
3438 const_offset /= (size / 2u);
3439
3440 RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4));
3441 Temp val = rc == info->dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
3442 if (read2)
3443 bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
3444 else
3445 bld.ds(op, Definition(val), offset, m, const_offset);
3446
3447 if (size < 4)
3448 val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u));
3449
3450 return val;
3451 }
3452
3453 static auto emit_lds_load = emit_load<lds_load_callback, false, true, UINT32_MAX>;
3454
3455 Temp smem_load_callback(Builder& bld, const LoadEmitInfo *info,
3456 Temp offset, unsigned bytes_needed,
3457 unsigned align, unsigned const_offset,
3458 Temp dst_hint)
3459 {
3460 unsigned size = 0;
3461 aco_opcode op;
3462 if (bytes_needed <= 4) {
3463 size = 1;
3464 op = info->resource.id() ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
3465 } else if (bytes_needed <= 8) {
3466 size = 2;
3467 op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
3468 } else if (bytes_needed <= 16) {
3469 size = 4;
3470 op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
3471 } else if (bytes_needed <= 32) {
3472 size = 8;
3473 op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
3474 } else {
3475 size = 16;
3476 op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
3477 }
3478 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
3479 if (info->resource.id()) {
3480 load->operands[0] = Operand(info->resource);
3481 load->operands[1] = Operand(offset);
3482 } else {
3483 load->operands[0] = Operand(offset);
3484 load->operands[1] = Operand(0u);
3485 }
3486 RegClass rc(RegType::sgpr, size);
3487 Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
3488 load->definitions[0] = Definition(val);
3489 load->glc = info->glc;
3490 load->dlc = info->glc && bld.program->chip_class >= GFX10;
3491 load->barrier = info->barrier;
3492 load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
3493 bld.insert(std::move(load));
3494 return val;
3495 }
3496
3497 static auto emit_smem_load = emit_load<smem_load_callback, true, false, 1024>;
3498
3499 Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
3500 Temp offset, unsigned bytes_needed,
3501 unsigned align_, unsigned const_offset,
3502 Temp dst_hint)
3503 {
3504 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3505 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
3506
3507 if (info->soffset.id()) {
3508 if (soffset.isTemp())
3509 vaddr = bld.copy(bld.def(v1), soffset);
3510 soffset = Operand(info->soffset);
3511 }
3512
3513 unsigned bytes_size = 0;
3514 aco_opcode op;
3515 if (bytes_needed == 1) {
3516 bytes_size = 1;
3517 op = aco_opcode::buffer_load_ubyte;
3518 } else if (bytes_needed == 2) {
3519 bytes_size = 2;
3520 op = aco_opcode::buffer_load_ushort;
3521 } else if (bytes_needed <= 4) {
3522 bytes_size = 4;
3523 op = aco_opcode::buffer_load_dword;
3524 } else if (bytes_needed <= 8) {
3525 bytes_size = 8;
3526 op = aco_opcode::buffer_load_dwordx2;
3527 } else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) {
3528 bytes_size = 12;
3529 op = aco_opcode::buffer_load_dwordx3;
3530 } else {
3531 bytes_size = 16;
3532 op = aco_opcode::buffer_load_dwordx4;
3533 }
3534 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3535 mubuf->operands[0] = Operand(info->resource);
3536 mubuf->operands[1] = vaddr;
3537 mubuf->operands[2] = soffset;
3538 mubuf->offen = (offset.type() == RegType::vgpr);
3539 mubuf->glc = info->glc;
3540 mubuf->dlc = info->glc && bld.program->chip_class >= GFX10;
3541 mubuf->barrier = info->barrier;
3542 mubuf->can_reorder = info->can_reorder;
3543 mubuf->offset = const_offset;
3544 mubuf->swizzled = info->swizzle_component_size != 0;
3545 RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
3546 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
3547 mubuf->definitions[0] = Definition(val);
3548 bld.insert(std::move(mubuf));
3549
3550 return val;
3551 }
3552
3553 static auto emit_mubuf_load = emit_load<mubuf_load_callback, true, true, 4096>;
3554
3555 Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
3556 {
3557 uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3558 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3559
3560 if (addr.type() == RegType::vgpr)
3561 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), Operand(-1u), Operand(rsrc_conf));
3562 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), Operand(rsrc_conf));
3563 }
3564
3565 Temp global_load_callback(Builder& bld, const LoadEmitInfo *info,
3566 Temp offset, unsigned bytes_needed,
3567 unsigned align_, unsigned const_offset,
3568 Temp dst_hint)
3569 {
3570 unsigned bytes_size = 0;
3571 bool mubuf = bld.program->chip_class == GFX6;
3572 bool global = bld.program->chip_class >= GFX9;
3573 aco_opcode op;
3574 if (bytes_needed == 1) {
3575 bytes_size = 1;
3576 op = mubuf ? aco_opcode::buffer_load_ubyte : global ? aco_opcode::global_load_ubyte : aco_opcode::flat_load_ubyte;
3577 } else if (bytes_needed == 2) {
3578 bytes_size = 2;
3579 op = mubuf ? aco_opcode::buffer_load_ushort : global ? aco_opcode::global_load_ushort : aco_opcode::flat_load_ushort;
3580 } else if (bytes_needed <= 4) {
3581 bytes_size = 4;
3582 op = mubuf ? aco_opcode::buffer_load_dword : global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
3583 } else if (bytes_needed <= 8) {
3584 bytes_size = 8;
3585 op = mubuf ? aco_opcode::buffer_load_dwordx2 : global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
3586 } else if (bytes_needed <= 12 && !mubuf) {
3587 bytes_size = 12;
3588 op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
3589 } else {
3590 bytes_size = 16;
3591 op = mubuf ? aco_opcode::buffer_load_dwordx4 : global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
3592 }
3593 RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
3594 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
3595 if (mubuf) {
3596 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3597 mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset));
3598 mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3599 mubuf->operands[2] = Operand(0u);
3600 mubuf->glc = info->glc;
3601 mubuf->dlc = false;
3602 mubuf->offset = 0;
3603 mubuf->addr64 = offset.type() == RegType::vgpr;
3604 mubuf->disable_wqm = false;
3605 mubuf->barrier = info->barrier;
3606 mubuf->definitions[0] = Definition(val);
3607 bld.insert(std::move(mubuf));
3608 } else {
3609 offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset;
3610
3611 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
3612 flat->operands[0] = Operand(offset);
3613 flat->operands[1] = Operand(s1);
3614 flat->glc = info->glc;
3615 flat->dlc = info->glc && bld.program->chip_class >= GFX10;
3616 flat->barrier = info->barrier;
3617 flat->offset = 0u;
3618 flat->definitions[0] = Definition(val);
3619 bld.insert(std::move(flat));
3620 }
3621
3622 return val;
3623 }
3624
3625 static auto emit_global_load = emit_load<global_load_callback, true, true, 1>;
3626
3627 Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
3628 Temp address, unsigned base_offset, unsigned align)
3629 {
3630 assert(util_is_power_of_two_nonzero(align));
3631
3632 Builder bld(ctx->program, ctx->block);
3633
3634 unsigned num_components = dst.bytes() / elem_size_bytes;
3635 LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
3636 info.align_mul = align;
3637 info.align_offset = 0;
3638 info.barrier = barrier_shared;
3639 info.can_reorder = false;
3640 info.const_offset = base_offset;
3641 emit_lds_load(ctx, bld, &info);
3642
3643 return dst;
3644 }
3645
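/* Split `src` into `count` pieces at the given byte `offsets` (the last piece runs to
 * the end of src), converting each piece to dst_type. Reuses an existing allocated_vec
 * entry for src when the split lines up with its components, otherwise falls back to a
 * single p_split_vector. */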
3646 void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp *dst, unsigned *offsets, Temp src)
3647 {
3648 if (!count)
3649 return;
3650
3651 Builder bld(ctx->program, ctx->block);
3652
3653 ASSERTED bool is_subdword = false;
3654 for (unsigned i = 0; i < count; i++)
3655 is_subdword |= offsets[i] % 4;
3656 is_subdword |= (src.bytes() - offsets[count - 1]) % 4;
3657 assert(!is_subdword || dst_type == RegType::vgpr);
3658
3659 /* count == 1 fast path */
3660 if (count == 1) {
3661 if (dst_type == RegType::sgpr)
3662 dst[0] = bld.as_uniform(src);
3663 else
3664 dst[0] = as_vgpr(ctx, src);
3665 return;
3666 }
3667
3668 for (unsigned i = 0; i < count - 1; i++)
3669 dst[i] = bld.tmp(RegClass::get(dst_type, offsets[i + 1] - offsets[i]));
3670 dst[count - 1] = bld.tmp(RegClass::get(dst_type, src.bytes() - offsets[count - 1]));
3671
3672 if (is_subdword && src.type() == RegType::sgpr) {
3673 src = as_vgpr(ctx, src);
3674 } else {
3675 /* use allocated_vec if possible */
3676 auto it = ctx->allocated_vec.find(src.id());
3677 if (it != ctx->allocated_vec.end()) {
3678 unsigned total_size = 0;
3679 for (unsigned i = 0; (i < NIR_MAX_VEC_COMPONENTS) && it->second[i].bytes(); i++)
3680 total_size += it->second[i].bytes();
3681 if (total_size != src.bytes())
3682 goto split;
3683
3684 unsigned elem_size = it->second[0].bytes();
3685
3686 for (unsigned i = 0; i < count; i++) {
3687 if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
3688 goto split;
3689 }
3690
3691 for (unsigned i = 0; i < count; i++) {
3692 unsigned start_idx = offsets[i] / elem_size;
3693 unsigned op_count = dst[i].bytes() / elem_size;
3694 if (op_count == 1) {
3695 if (dst_type == RegType::sgpr)
3696 dst[i] = bld.as_uniform(it->second[start_idx]);
3697 else
3698 dst[i] = as_vgpr(ctx, it->second[start_idx]);
3699 continue;
3700 }
3701
3702 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)};
3703 for (unsigned j = 0; j < op_count; j++) {
3704 Temp tmp = it->second[start_idx + j];
3705 if (dst_type == RegType::sgpr)
3706 tmp = bld.as_uniform(tmp);
3707 vec->operands[j] = Operand(tmp);
3708 }
3709 vec->definitions[0] = Definition(dst[i]);
3710 bld.insert(std::move(vec));
3711 }
3712 return;
3713 }
3714 }
3715
3716 if (dst_type == RegType::sgpr)
3717 src = bld.as_uniform(src);
3718
3719 split:
3720 /* just split it */
3721 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
3722 split->operands[0] = Operand(src);
3723 for (unsigned i = 0; i < count; i++)
3724 split->definitions[i] = Definition(dst[i]);
3725 bld.insert(std::move(split));
3726 }
3727
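/* Find the next consecutive byte range in `todo_mask`, starting at the lowest
 * still-to-do byte, that is either entirely written (returns true) or entirely
 * skipped (returns false) according to `mask`. */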
3728 bool scan_write_mask(uint32_t mask, uint32_t todo_mask,
3729 int *start, int *count)
3730 {
3731 unsigned start_elem = ffs(todo_mask) - 1;
3732 bool skip = !(mask & (1 << start_elem));
3733 if (skip)
3734 mask = ~mask & todo_mask;
3735
3736 mask &= todo_mask;
3737
3738 u_bit_scan_consecutive_range(&mask, start, count);
3739
3740 return !skip;
3741 }
3742
3743 void advance_write_mask(uint32_t *todo_mask, int start, int count)
3744 {
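   /* Clears bits [start, start + count). This also clears everything below `start`,
    * which is fine because callers always pass the lowest set bit of *todo_mask. */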
3745 *todo_mask &= ~u_bit_consecutive(0, count) << start;
3746 }
3747
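/* Store `data` to LDS at address + base_offset according to the per-byte write mask:
 * pick the widest ds_write_* that the alignment allows for each contiguous run of
 * bytes and, where possible, pair two equally-sized writes into a ds_write2. */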
3748 void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
3749 Temp address, unsigned base_offset, unsigned align)
3750 {
3751 assert(util_is_power_of_two_nonzero(align));
3752 assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
3753
3754 Builder bld(ctx->program, ctx->block);
3755 bool large_ds_write = ctx->options->chip_class >= GFX7;
3756 bool usable_write2 = ctx->options->chip_class >= GFX7;
3757
3758 unsigned write_count = 0;
3759 Temp write_datas[32];
3760 unsigned offsets[32];
3761 aco_opcode opcodes[32];
3762
3763 wrmask = widen_mask(wrmask, elem_size_bytes);
3764
3765 uint32_t todo = u_bit_consecutive(0, data.bytes());
3766 while (todo) {
3767 int offset, bytes;
3768 if (!scan_write_mask(wrmask, todo, &offset, &bytes)) {
3769 offsets[write_count] = offset;
3770 opcodes[write_count] = aco_opcode::num_opcodes;
3771 write_count++;
3772 advance_write_mask(&todo, offset, bytes);
3773 continue;
3774 }
3775
3776 bool aligned2 = offset % 2 == 0 && align % 2 == 0;
3777 bool aligned4 = offset % 4 == 0 && align % 4 == 0;
3778 bool aligned8 = offset % 8 == 0 && align % 8 == 0;
3779 bool aligned16 = offset % 16 == 0 && align % 16 == 0;
3780
3781 //TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
3782 aco_opcode op = aco_opcode::num_opcodes;
3783 if (bytes >= 16 && aligned16 && large_ds_write) {
3784 op = aco_opcode::ds_write_b128;
3785 bytes = 16;
3786 } else if (bytes >= 12 && aligned16 && large_ds_write) {
3787 op = aco_opcode::ds_write_b96;
3788 bytes = 12;
3789 } else if (bytes >= 8 && aligned8) {
3790 op = aco_opcode::ds_write_b64;
3791 bytes = 8;
3792 } else if (bytes >= 4 && aligned4) {
3793 op = aco_opcode::ds_write_b32;
3794 bytes = 4;
3795 } else if (bytes >= 2 && aligned2) {
3796 op = aco_opcode::ds_write_b16;
3797 bytes = 2;
3798 } else if (bytes >= 1) {
3799 op = aco_opcode::ds_write_b8;
3800 bytes = 1;
3801 } else {
3802 assert(false);
3803 }
3804
3805 offsets[write_count] = offset;
3806 opcodes[write_count] = op;
3807 write_count++;
3808 advance_write_mask(&todo, offset, bytes);
3809 }
3810
3811 Operand m = load_lds_size_m0(bld);
3812
3813 split_store_data(ctx, RegType::vgpr, write_count, write_datas, offsets, data);
3814
3815 for (unsigned i = 0; i < write_count; i++) {
3816 aco_opcode op = opcodes[i];
3817 if (op == aco_opcode::num_opcodes)
3818 continue;
3819
3820 Temp data = write_datas[i];
3821
3822 unsigned second = write_count;
3823 if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
3824 for (second = i + 1; second < write_count; second++) {
3825 if (opcodes[second] == op && (offsets[second] - offsets[i]) % data.bytes() == 0) {
3826 op = data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
3827 opcodes[second] = aco_opcode::num_opcodes;
3828 break;
3829 }
3830 }
3831 }
3832
3833 bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
3834 unsigned write2_off = (offsets[second] - offsets[i]) / data.bytes();
3835
3836 unsigned inline_offset = base_offset + offsets[i];
3837 unsigned max_offset = write2 ? (255 - write2_off) * data.bytes() : 65535;
3838 Temp address_offset = address;
3839 if (inline_offset > max_offset) {
3840 address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
3841 inline_offset = offsets[i];
3842 }
3843 assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */
3844
3845 if (write2) {
3846 Temp second_data = write_datas[second];
3847 inline_offset /= data.bytes();
3848 bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
3849 } else {
3850 bld.ds(op, address_offset, data, m, inline_offset);
3851 }
3852 }
3853 }
3854
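/* Alignment implied by a constant LDS offset: the largest power of two dividing
 * const_offset, capped at 16 bytes (16 if the offset is zero). */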
3855 unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
3856 {
3857 unsigned align = 16;
3858 if (const_offset)
3859 align = std::min(align, 1u << (ffs(const_offset) - 1));
3860
3861 return align;
3862 }
3863
3864
3865 aco_opcode get_buffer_store_op(bool smem, unsigned bytes)
3866 {
3867 switch (bytes) {
3868 case 1:
3869 assert(!smem);
3870 return aco_opcode::buffer_store_byte;
3871 case 2:
3872 assert(!smem);
3873 return aco_opcode::buffer_store_short;
3874 case 4:
3875 return smem ? aco_opcode::s_buffer_store_dword : aco_opcode::buffer_store_dword;
3876 case 8:
3877 return smem ? aco_opcode::s_buffer_store_dwordx2 : aco_opcode::buffer_store_dwordx2;
3878 case 12:
3879 assert(!smem);
3880 return aco_opcode::buffer_store_dwordx3;
3881 case 16:
3882 return smem ? aco_opcode::s_buffer_store_dwordx4 : aco_opcode::buffer_store_dwordx4;
3883 }
3884 unreachable("Unexpected store size");
3885 return aco_opcode::num_opcodes;
3886 }
3887
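/* Decide how to split a buffer store: walk the write mask and cut `data` into chunks
 * of 1/2/4/8/12/16 bytes that respect the swizzle element size, the known alignment
 * and per-chip limits (no 12-byte stores on SMEM or GFX6 VMEM). Gaps in the write
 * mask are tracked as skips and dropped from the output arrays. */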
3888 void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem, RegType dst_type,
3889 Temp data, unsigned writemask, int swizzle_element_size,
3890 unsigned *write_count, Temp *write_datas, unsigned *offsets)
3891 {
3892 unsigned write_count_with_skips = 0;
3893 bool skips[16];
3894
3895 /* determine how to split the data */
3896 unsigned todo = u_bit_consecutive(0, data.bytes());
3897 while (todo) {
3898 int offset, bytes;
3899 skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &bytes);
3900 offsets[write_count_with_skips] = offset;
3901 if (skips[write_count_with_skips]) {
3902 advance_write_mask(&todo, offset, bytes);
3903 write_count_with_skips++;
3904 continue;
3905 }
3906
3907 /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
3908 * larger than swizzle_element_size */
3909 bytes = MIN2(bytes, swizzle_element_size);
3910 if (bytes % 4)
3911 bytes = bytes > 4 ? bytes & ~0x3 : MIN2(bytes, 2);
3912
3913 /* SMEM and GFX6 VMEM can't emit 12-byte stores */
3914 if ((ctx->program->chip_class == GFX6 || smem) && bytes == 12)
3915 bytes = 8;
3916
3917 /* dword or larger stores have to be dword-aligned */
3918 unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
3919 unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
3920 bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
3921 if (!dword_aligned)
3922 bytes = MIN2(bytes, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
3923
3924 advance_write_mask(&todo, offset, bytes);
3925 write_count_with_skips++;
3926 }
3927
3928 /* actually split data */
3929 split_store_data(ctx, dst_type, write_count_with_skips, write_datas, offsets, data);
3930
3931 /* remove skips */
3932 for (unsigned i = 0; i < write_count_with_skips; i++) {
3933 if (skips[i])
3934 continue;
3935 write_datas[*write_count] = write_datas[i];
3936 offsets[*write_count] = offsets[i];
3937 (*write_count)++;
3938 }
3939 }
3940
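/* Build a p_create_vector from `cnt` elements of `elem_size_bytes` bytes each,
 * substituting zero for array entries without a valid id; either splits the result
 * again or records the components in ctx->allocated_vec. */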
3941 Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes,
3942 unsigned split_cnt = 0u, Temp dst = Temp())
3943 {
3944 Builder bld(ctx->program, ctx->block);
3945 unsigned dword_size = elem_size_bytes / 4;
3946
3947 if (!dst.id())
3948 dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
3949
3950 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
3951 aco_ptr<Pseudo_instruction> instr {create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
3952 instr->definitions[0] = Definition(dst);
3953
3954 for (unsigned i = 0; i < cnt; ++i) {
3955 if (arr[i].id()) {
3956 assert(arr[i].size() == dword_size);
3957 allocated_vec[i] = arr[i];
3958 instr->operands[i] = Operand(arr[i]);
3959 } else {
3960 Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)), Operand(0u, dword_size == 2));
3961 allocated_vec[i] = zero;
3962 instr->operands[i] = Operand(zero);
3963 }
3964 }
3965
3966 bld.insert(std::move(instr));
3967
3968 if (split_cnt)
3969 emit_split_vector(ctx, dst, split_cnt);
3970 else
3971 ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
3972
3973 return dst;
3974 }
3975
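/* MUBUF instructions only have a 12-bit unsigned immediate offset (< 4096), so fold
 * any excess multiple of 4096 into voffset and return the remaining constant offset. */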
3976 inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, unsigned const_offset)
3977 {
3978 if (const_offset >= 4096) {
3979 unsigned excess_const_offset = const_offset / 4096u * 4096u;
3980 const_offset %= 4096u;
3981
3982 if (!voffset.id())
3983 voffset = bld.copy(bld.def(v1), Operand(excess_const_offset));
3984 else if (unlikely(voffset.regClass() == s1))
3985 voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), Operand(excess_const_offset), Operand(voffset));
3986 else if (likely(voffset.regClass() == v1))
3987 voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand(excess_const_offset));
3988 else
3989 unreachable("Unsupported register class of voffset");
3990 }
3991
3992 return const_offset;
3993 }
3994
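/* Emit a single MUBUF store of `vdata` through the given descriptor, with any
 * out-of-range constant offset folded into voffset first. */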
3995 void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
3996 unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false,
3997 bool swizzled = false)
3998 {
3999 assert(vdata.id());
4000 assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
4001 assert(vdata.size() >= 1 && vdata.size() <= 4);
4002
4003 Builder bld(ctx->program, ctx->block);
4004 aco_opcode op = get_buffer_store_op(false, vdata.bytes());
4005 const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
4006
4007 Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
4008 Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
4009 Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
4010 /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
4011 /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
4012 /* dlc*/ false, /* slc */ slc);
4013
4014 static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
4015 }
4016
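/* Store `src` through a buffer descriptor: split it according to write_mask
 * (combining up to 16 bytes per store when allowed) and emit one MUBUF store per
 * resulting chunk. */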
4017 void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
4018 unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
4019 bool allow_combining = true, bool reorder = true, bool slc = false)
4020 {
4021 Builder bld(ctx->program, ctx->block);
4022 assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4023 assert(write_mask);
4024 write_mask = widen_mask(write_mask, elem_size_bytes);
4025
4026 unsigned write_count = 0;
4027 Temp write_datas[32];
4028 unsigned offsets[32];
4029 split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask,
4030 allow_combining ? 16 : 4, &write_count, write_datas, offsets);
4031
4032 for (unsigned i = 0; i < write_count; i++) {
4033 unsigned const_offset = offsets[i] + base_const_offset;
4034 emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc, !allow_combining);
4035 }
4036 }
4037
4038 void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
4039 unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
4040 unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
4041 {
4042 assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4043 assert((num_components * elem_size_bytes) == dst.bytes());
4044 assert(!!stride != allow_combining);
4045
4046 Builder bld(ctx->program, ctx->block);
4047
4048 LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
4049 info.component_stride = allow_combining ? 0 : stride;
4050 info.glc = true;
4051 info.swizzle_component_size = allow_combining ? 0 : 4;
4052 info.align_mul = MIN2(elem_size_bytes, 4);
4053 info.align_offset = 0;
4054 info.soffset = soffset;
4055 info.const_offset = base_const_offset;
4056 emit_mubuf_load(ctx, bld, &info);
4057 }
4058
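/* Add a NIR offset source, scaled by `stride`, to an existing (variable, constant)
 * offset pair: constant NIR offsets are folded into the constant part, others are
 * multiplied by the stride and added to the variable part. */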
4059 std::pair<Temp, unsigned> offset_add_from_nir(isel_context *ctx, const std::pair<Temp, unsigned> &base_offset, nir_src *off_src, unsigned stride = 1u)
4060 {
4061 Builder bld(ctx->program, ctx->block);
4062 Temp offset = base_offset.first;
4063 unsigned const_offset = base_offset.second;
4064
4065 if (!nir_src_is_const(*off_src)) {
4066 Temp indirect_offset_arg = get_ssa_temp(ctx, off_src->ssa);
4067 Temp with_stride;
4068
4069 /* Calculate indirect offset with stride */
4070 if (likely(indirect_offset_arg.regClass() == v1))
4071 with_stride = bld.v_mul24_imm(bld.def(v1), indirect_offset_arg, stride);
4072 else if (indirect_offset_arg.regClass() == s1)
4073 with_stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), indirect_offset_arg);
4074 else
4075 unreachable("Unsupported register class of indirect offset");
4076
4077 /* Add to the supplied base offset */
4078 if (offset.id() == 0)
4079 offset = with_stride;
4080 else if (unlikely(offset.regClass() == s1 && with_stride.regClass() == s1))
4081 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), with_stride, offset);
4082 else if (offset.size() == 1 && with_stride.size() == 1)
4083 offset = bld.vadd32(bld.def(v1), with_stride, offset);
4084 else
4085 unreachable("Unsupported register class of indirect offset");
4086 } else {
4087 unsigned const_offset_arg = nir_src_as_uint(*off_src);
4088 const_offset += const_offset_arg * stride;
4089 }
4090
4091 return std::make_pair(offset, const_offset);
4092 }
4093
4094 std::pair<Temp, unsigned> offset_add(isel_context *ctx, const std::pair<Temp, unsigned> &off1, const std::pair<Temp, unsigned> &off2)
4095 {
4096 Builder bld(ctx->program, ctx->block);
4097 Temp offset;
4098
4099 if (off1.first.id() && off2.first.id()) {
4100 if (unlikely(off1.first.regClass() == s1 && off2.first.regClass() == s1))
4101 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), off1.first, off2.first);
4102 else if (off1.first.size() == 1 && off2.first.size() == 1)
4103 offset = bld.vadd32(bld.def(v1), off1.first, off2.first);
4104 else
4105 unreachable("Unsupported register class of indirect offset");
4106 } else {
4107 offset = off1.first.id() ? off1.first : off2.first;
4108 }
4109
4110 return std::make_pair(offset, off1.second + off2.second);
4111 }
4112
4113 std::pair<Temp, unsigned> offset_mul(isel_context *ctx, const std::pair<Temp, unsigned> &offs, unsigned multiplier)
4114 {
4115 Builder bld(ctx->program, ctx->block);
4116 unsigned const_offset = offs.second * multiplier;
4117
4118 if (!offs.first.id())
4119 return std::make_pair(offs.first, const_offset);
4120
4121 Temp offset = unlikely(offs.first.regClass() == s1)
4122 ? bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(multiplier), offs.first)
4123 : bld.v_mul24_imm(bld.def(v1), offs.first, multiplier);
4124
4125 return std::make_pair(offset, const_offset);
4126 }
4127
4128 std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride, unsigned component_stride)
4129 {
4130 Builder bld(ctx->program, ctx->block);
4131
4132 /* base is the driver_location, which is already multiplied by 4, so is in dwords */
4133 unsigned const_offset = nir_intrinsic_base(instr) * base_stride;
4134 /* component is in bytes */
4135 const_offset += nir_intrinsic_component(instr) * component_stride;
4136
4137   /* The offset is relative to the base, so a non-zero offset makes the instruction effectively read/write a different input/output slot. */
4138 nir_src *off_src = nir_get_io_offset_src(instr);
4139 return offset_add_from_nir(ctx, std::make_pair(Temp(), const_offset), off_src, 4u * base_stride);
4140 }
4141
4142 std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned stride = 1u)
4143 {
4144 return get_intrinsic_io_basic_offset(ctx, instr, stride, stride);
4145 }
4146
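/* Returns the relative patch ID of the current invocation:
 * TCS: the lower 8 bits of tcs_rel_ids, TES: the tes_rel_patch_id argument. */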
4147 Temp get_tess_rel_patch_id(isel_context *ctx)
4148 {
4149 Builder bld(ctx->program, ctx->block);
4150
4151 switch (ctx->shader->info.stage) {
4152 case MESA_SHADER_TESS_CTRL:
4153 return bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffu),
4154 get_arg(ctx, ctx->args->ac.tcs_rel_ids));
4155 case MESA_SHADER_TESS_EVAL:
4156 return get_arg(ctx, ctx->args->tes_rel_patch_id);
4157 default:
4158 unreachable("Unsupported stage in get_tess_rel_patch_id");
4159 }
4160 }
4161
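/* LDS offset (in bytes) of a TCS per-vertex input:
 * current patch base + vertex index * per-vertex stride + the intrinsic's own offset. */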
4162 std::pair<Temp, unsigned> get_tcs_per_vertex_input_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr)
4163 {
4164 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4165 Builder bld(ctx->program, ctx->block);
4166
4167 uint32_t tcs_in_patch_stride = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 4;
4168 uint32_t tcs_in_vertex_stride = ctx->tcs_num_inputs * 4;
4169
4170 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr);
4171
4172 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4173 offs = offset_add_from_nir(ctx, offs, vertex_index_src, tcs_in_vertex_stride);
4174
4175 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4176 Temp tcs_in_current_patch_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, tcs_in_patch_stride);
4177 offs = offset_add(ctx, offs, std::make_pair(tcs_in_current_patch_offset, 0));
4178
4179 return offset_mul(ctx, offs, 4u);
4180 }
4181
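/* LDS offset of a TCS output. Outputs are laid out after the inputs of all
 * patches; within each patch, per-vertex outputs precede per-patch outputs. */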
4182 std::pair<Temp, unsigned> get_tcs_output_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, bool per_vertex = false)
4183 {
4184 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4185 Builder bld(ctx->program, ctx->block);
4186
4187 uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16;
4188 uint32_t output_vertex_size = ctx->tcs_num_outputs * 16;
4189 uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
4190 uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16;
4191
4192 std::pair<Temp, unsigned> offs = instr
4193 ? get_intrinsic_io_basic_offset(ctx, instr, 4u)
4194 : std::make_pair(Temp(), 0u);
4195
4196 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4197 Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, output_patch_stride);
4198
4199 if (per_vertex) {
4200 assert(instr);
4201
4202 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4203 offs = offset_add_from_nir(ctx, offs, vertex_index_src, output_vertex_size);
4204
4205 uint32_t output_patch0_offset = (input_patch_size * ctx->tcs_num_patches);
4206 offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_offset));
4207 } else {
4208 uint32_t output_patch0_patch_data_offset = (input_patch_size * ctx->tcs_num_patches + pervertex_output_patch_size);
4209 offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_patch_data_offset));
4210 }
4211
4212 return offs;
4213 }
4214
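/* VMEM offset of a TCS per-vertex output in the offchip tess buffer.
 * The layout is attribute-major: each attribute stores one vec4 per vertex
 * of every patch. */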
4215 std::pair<Temp, unsigned> get_tcs_per_vertex_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr)
4216 {
4217 Builder bld(ctx->program, ctx->block);
4218
4219 unsigned vertices_per_patch = ctx->shader->info.tess.tcs_vertices_out;
4220 unsigned attr_stride = vertices_per_patch * ctx->tcs_num_patches;
4221
4222 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u);
4223
4224 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4225 Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, vertices_per_patch * 16u);
4226 offs = offset_add(ctx, offs, std::make_pair(patch_off, 0u));
4227
4228 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4229 offs = offset_add_from_nir(ctx, offs, vertex_index_src, 16u);
4230
4231 return offs;
4232 }
4233
4234 std::pair<Temp, unsigned> get_tcs_per_patch_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, unsigned const_base_offset = 0u)
4235 {
4236 Builder bld(ctx->program, ctx->block);
4237
4238 unsigned output_vertex_size = ctx->tcs_num_outputs * 16;
4239 unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
4240 unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches;
4241 unsigned attr_stride = ctx->tcs_num_patches;
4242
4243 std::pair<Temp, unsigned> offs = instr
4244 ? get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u)
4245 : std::make_pair(Temp(), 0u);
4246
4247 if (const_base_offset)
4248 offs.second += const_base_offset * attr_stride;
4249
4250 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4251 Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, 16u);
4252 offs = offset_add(ctx, offs, std::make_pair(patch_off, per_patch_data_offset));
4253
4254 return offs;
4255 }
4256
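/* Checks whether the intrinsic's driver location corresponds to an API varying
 * slot that is set in the given mask. Sets *indirect and returns false when the
 * offset is not constant. */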
4257 bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect)
4258 {
4259 assert(per_vertex || ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4260
4261 if (mask == 0)
4262 return false;
4263
4264 unsigned drv_loc = nir_intrinsic_base(instr);
4265 nir_src *off_src = nir_get_io_offset_src(instr);
4266
4267 if (!nir_src_is_const(*off_src)) {
4268 *indirect = true;
4269 return false;
4270 }
4271
4272 *indirect = false;
4273 uint64_t slot = per_vertex
4274 ? ctx->output_drv_loc_to_var_slot[ctx->shader->info.stage][drv_loc / 4]
4275 : (ctx->output_tcs_patch_drv_loc_to_var_slot[drv_loc / 4] - VARYING_SLOT_PATCH0);
4276 return (((uint64_t) 1) << slot) & mask;
4277 }
4278
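/* Stashes the output value in ctx->outputs instead of emitting a memory store.
 * Returns false when the store offset is not a constant. */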
4279 bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
4280 {
4281 unsigned write_mask = nir_intrinsic_write_mask(instr);
4282 unsigned component = nir_intrinsic_component(instr);
4283 unsigned idx = nir_intrinsic_base(instr) + component;
4284
4285 nir_instr *off_instr = instr->src[1].ssa->parent_instr;
4286 if (off_instr->type != nir_instr_type_load_const)
4287 return false;
4288
4289 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4290 idx += nir_src_as_uint(instr->src[1]) * 4u;
4291
4292 if (instr->src[0].ssa->bit_size == 64)
4293 write_mask = widen_mask(write_mask, 2);
4294
4295 RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
4296
4297 for (unsigned i = 0; i < 8; ++i) {
4298 if (write_mask & (1 << i)) {
4299 ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
4300 ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
4301 }
4302 idx++;
4303 }
4304
4305 return true;
4306 }
4307
4308 bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp dst)
4309 {
4310    /* Only TCS per-vertex inputs are supported by this function. Per-vertex inputs can
4311     * only be read from temporaries when the VS and TCS invocation IDs match, i.e. when
4312     * both stages have the same number of invocations. */
4313 if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
4314 return false;
4315
4316 nir_src *off_src = nir_get_io_offset_src(instr);
4317 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4318 nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr;
4319 bool can_use_temps = nir_src_is_const(*off_src) &&
4320 vertex_index_instr->type == nir_instr_type_intrinsic &&
4321 nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
4322
4323 if (!can_use_temps)
4324 return false;
4325
4326 unsigned idx = nir_intrinsic_base(instr) + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
4327 Temp *src = &ctx->inputs.temps[idx];
4328 create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
4329
4330 return true;
4331 }
4332
4333 void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
4334 {
4335 Builder bld(ctx->program, ctx->block);
4336
4337 if (ctx->tcs_in_out_eq && store_output_to_temps(ctx, instr)) {
4338       /* When the TCS reads this output only directly and only for the vertex matching its invocation ID, storing the VS output to LDS is unnecessary. */
4339 bool indirect_write;
4340 bool temp_only_input = tcs_driver_location_matches_api_mask(ctx, instr, true, ctx->tcs_temp_only_inputs, &indirect_write);
4341 if (temp_only_input && !indirect_write)
4342 return;
4343 }
4344
4345 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, 4u);
4346 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4347 unsigned write_mask = nir_intrinsic_write_mask(instr);
4348 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u;
4349
4350 if (ctx->stage == vertex_es || ctx->stage == tess_eval_es) {
4351 /* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
4352 Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
4353 Temp es2gs_offset = get_arg(ctx, ctx->args->es2gs_offset);
4354 store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, true, true);
4355 } else {
4356 Temp lds_base;
4357
4358 if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
4359 /* GFX9+: ES stage is merged into GS, data is passed between them using LDS. */
4360 unsigned itemsize = ctx->stage == vertex_geometry_gs
4361 ? ctx->program->info->vs.es_info.esgs_itemsize
4362 : ctx->program->info->tes.es_info.esgs_itemsize;
4363 Temp thread_id = emit_mbcnt(ctx, bld.def(v1));
4364 Temp wave_idx = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand(4u << 16 | 24));
4365 Temp vertex_idx = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), thread_id,
4366 bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_idx), ctx->program->wave_size));
4367 lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, itemsize);
4368 } else if (ctx->stage == vertex_ls || ctx->stage == vertex_tess_control_hs) {
4369 /* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS.
4370 * GFX9+: LS is merged into HS, but still uses the same LDS layout.
4371 */
4372 Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id);
4373 lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u);
4374 } else {
4375 unreachable("Invalid LS or ES stage");
4376 }
4377
4378 offs = offset_add(ctx, offs, std::make_pair(lds_base, 0u));
4379 unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
4380 store_lds(ctx, elem_size_bytes, src, write_mask, offs.first, offs.second, lds_align);
4381 }
4382 }
4383
4384 bool tcs_output_is_tess_factor(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4385 {
4386 if (per_vertex)
4387 return false;
4388
4389 unsigned off = nir_intrinsic_base(instr) * 4u;
4390 return off == ctx->tcs_tess_lvl_out_loc ||
4391 off == ctx->tcs_tess_lvl_in_loc;
4393 }
4394
4395 bool tcs_output_is_read_by_tes(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4396 {
4397 uint64_t mask = per_vertex
4398 ? ctx->program->info->tcs.tes_inputs_read
4399 : ctx->program->info->tcs.tes_patch_inputs_read;
4400
4401 bool indirect_write = false;
4402 bool output_read_by_tes = tcs_driver_location_matches_api_mask(ctx, instr, per_vertex, mask, &indirect_write);
4403 return indirect_write || output_read_by_tes;
4404 }
4405
4406 bool tcs_output_is_read_by_tcs(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4407 {
4408 uint64_t mask = per_vertex
4409 ? ctx->shader->info.outputs_read
4410 : ctx->shader->info.patch_outputs_read;
4411
4412 bool indirect_write = false;
4413 bool output_read = tcs_driver_location_matches_api_mask(ctx, instr, per_vertex, mask, &indirect_write);
4414 return indirect_write || output_read;
4415 }
4416
4417 void visit_store_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4418 {
4419 assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
4420 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4421
4422 Builder bld(ctx->program, ctx->block);
4423
4424 Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
4425 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4426 unsigned write_mask = nir_intrinsic_write_mask(instr);
4427
4428 bool is_tess_factor = tcs_output_is_tess_factor(ctx, instr, per_vertex);
4429 bool write_to_vmem = !is_tess_factor && tcs_output_is_read_by_tes(ctx, instr, per_vertex);
4430 bool write_to_lds = is_tess_factor || tcs_output_is_read_by_tcs(ctx, instr, per_vertex);
4431
4432 if (write_to_vmem) {
4433 std::pair<Temp, unsigned> vmem_offs = per_vertex
4434 ? get_tcs_per_vertex_output_vmem_offset(ctx, instr)
4435 : get_tcs_per_patch_output_vmem_offset(ctx, instr);
4436
4437 Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
4438 Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
4439 store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, false);
4440 }
4441
4442 if (write_to_lds) {
4443 std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
4444 unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
4445 store_lds(ctx, elem_size_bytes, store_val, write_mask, lds_offs.first, lds_offs.second, lds_align);
4446 }
4447 }
4448
4449 void visit_load_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4450 {
4451 assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
4452 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4453
4454 Builder bld(ctx->program, ctx->block);
4455
4456 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4457 std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
4458 unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
4459    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
4460
4461 load_lds(ctx, elem_size_bytes, dst, lds_offs.first, lds_offs.second, lds_align);
4462 }
4463
4464 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
4465 {
4466 if (ctx->stage == vertex_vs ||
4467 ctx->stage == tess_eval_vs ||
4468 ctx->stage == fragment_fs ||
4469 ctx->stage == ngg_vertex_gs ||
4470 ctx->stage == ngg_tess_eval_gs ||
4471 ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
4472 bool stored_to_temps = store_output_to_temps(ctx, instr);
4473 if (!stored_to_temps) {
4474 fprintf(stderr, "Unimplemented output offset instruction:\n");
4475 nir_print_instr(instr->src[1].ssa->parent_instr, stderr);
4476 fprintf(stderr, "\n");
4477 abort();
4478 }
4479 } else if (ctx->stage == vertex_es ||
4480 ctx->stage == vertex_ls ||
4481 ctx->stage == tess_eval_es ||
4482 (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
4483 (ctx->stage == vertex_geometry_gs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
4484 (ctx->stage == tess_eval_geometry_gs && ctx->shader->info.stage == MESA_SHADER_TESS_EVAL)) {
4485 visit_store_ls_or_es_output(ctx, instr);
4486 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
4487 visit_store_tcs_output(ctx, instr, false);
4488 } else {
4489 unreachable("Shader stage not implemented");
4490 }
4491 }
4492
4493 void visit_load_output(isel_context *ctx, nir_intrinsic_instr *instr)
4494 {
4495 visit_load_tcs_output(ctx, instr, false);
4496 }
4497
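/* Emits two-phase parameter interpolation (v_interp_p1/p2) of the given
 * attribute component at the supplied barycentric coordinates, using the
 * 16-bit variants when dst is a v2b. */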
4498 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
4499 {
4500 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
4501 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
4502
4503 Builder bld(ctx->program, ctx->block);
4504
4505 if (dst.regClass() == v2b) {
4506 if (ctx->program->has_16bank_lds) {
4507 assert(ctx->options->chip_class <= GFX8);
4508 Builder::Result interp_p1 =
4509 bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1),
4510 Operand(2u) /* P0 */, bld.m0(prim_mask), idx, component);
4511 interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b),
4512 coord1, bld.m0(prim_mask), interp_p1, idx, component);
4513 bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2,
4514 bld.m0(prim_mask), interp_p1, idx, component);
4515 } else {
4516 aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
4517
4518 if (ctx->options->chip_class == GFX8)
4519 interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
4520
4521 Builder::Result interp_p1 =
4522 bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1),
4523 coord1, bld.m0(prim_mask), idx, component);
4524 bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask),
4525 interp_p1, idx, component);
4526 }
4527 } else {
4528 Builder::Result interp_p1 =
4529 bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
4530 bld.m0(prim_mask), idx, component);
4531
4532 if (ctx->program->has_16bank_lds)
4533 interp_p1.instr->operands[0].setLateKill(true);
4534
4535 bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2,
4536 bld.m0(prim_mask), interp_p1, idx, component);
4537 }
4538 }
4539
4540 void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
4541 {
4542 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
4543 for (unsigned i = 0; i < num_components; i++)
4544 vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
4545 if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4546 assert(num_components == 4);
4547 Builder bld(ctx->program, ctx->block);
4548 vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
4549 }
4550
4551 for (Operand& op : vec->operands)
4552 op = op.isUndefined() ? Operand(0u) : op;
4553
4554 vec->definitions[0] = Definition(dst);
4555 ctx->block->instructions.emplace_back(std::move(vec));
4556 emit_split_vector(ctx, dst, num_components);
4557 return;
4558 }
4559
4560 void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
4561 {
4562 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4563 Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
4564 unsigned idx = nir_intrinsic_base(instr);
4565 unsigned component = nir_intrinsic_component(instr);
4566 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
4567
4568 nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
4569 if (offset) {
4570 assert(offset->u32 == 0);
4571 } else {
4572       /* the lower 15 bits of the prim_mask contain the offset into LDS,
4573        * while the upper bits contain the number of prims */
4574 Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
4575 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
4576 Builder bld(ctx->program, ctx->block);
4577 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
4578 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
4579 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
4580 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
4581 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
4582 }
4583
4584 if (instr->dest.ssa.num_components == 1) {
4585 emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
4586 } else {
4587 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
4588 for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
4589 {
4590 Temp tmp = {ctx->program->allocateId(), v1};
4591 emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
4592 vec->operands[i] = Operand(tmp);
4593 }
4594 vec->definitions[0] = Definition(dst);
4595 ctx->block->instructions.emplace_back(std::move(vec));
4596 }
4597 }
4598
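/* Checks whether a typed fetch of the given channel count is usable with this
 * offset and stride: 3-channel formats with non-dword channels are rejected,
 * and GFX6/GFX10 additionally require the offset and stride to be aligned to
 * the full fetch size. */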
4599 bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info,
4600 unsigned offset, unsigned stride, unsigned channels)
4601 {
4602 unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
4603 if (vtx_info->chan_byte_size != 4 && channels == 3)
4604 return false;
4605 return (ctx->options->chip_class != GFX6 && ctx->options->chip_class != GFX10) ||
4606 (offset % vertex_byte_size == 0 && stride % vertex_byte_size == 0);
4607 }
4608
4609 uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info,
4610 unsigned offset, unsigned stride, unsigned *channels)
4611 {
4612 if (!vtx_info->chan_byte_size) {
4613 *channels = vtx_info->num_channels;
4614 return vtx_info->chan_format;
4615 }
4616
4617 unsigned num_channels = *channels;
4618 if (!check_vertex_fetch_size(ctx, vtx_info, offset, stride, *channels)) {
4619 unsigned new_channels = num_channels + 1;
4620       /* first, assume that more loads are worse and try using a larger data format */
4621 while (new_channels <= 4 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) {
4622 new_channels++;
4623 /* don't make the attribute potentially out-of-bounds */
4624 if (offset + new_channels * vtx_info->chan_byte_size > stride)
4625 new_channels = 5;
4626 }
4627
4628 if (new_channels == 5) {
4629 /* then try decreasing load size (at the cost of more loads) */
4630 new_channels = *channels;
4631 while (new_channels > 1 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels))
4632 new_channels--;
4633 }
4634
4635 if (new_channels < *channels)
4636 *channels = new_channels;
4637 num_channels = new_channels;
4638 }
4639
4640 switch (vtx_info->chan_format) {
4641 case V_008F0C_BUF_DATA_FORMAT_8:
4642 return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
4643 V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
4644 case V_008F0C_BUF_DATA_FORMAT_16:
4645 return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
4646 V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
4647 case V_008F0C_BUF_DATA_FORMAT_32:
4648 return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
4649 V_008F0C_BUF_DATA_FORMAT_32_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
4650 }
4651 unreachable("shouldn't reach here");
4652 return V_008F0C_BUF_DATA_FORMAT_INVALID;
4653 }
4654
4655 /* For 2_10_10_10 formats, the alpha channel is handled as unsigned by pre-Vega HW,
4656  * so we may need to fix it up. */
4657 Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
4658 {
4659 Builder bld(ctx->program, ctx->block);
4660
4661 if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
4662 alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
4663
4664 /* For the integer-like cases, do a natural sign extension.
4665 *
4666 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
4667 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
4668 * exponent.
4669 */
4670 alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
4671 alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
4672
4673 /* Convert back to the right type. */
4674 if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
4675 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4676 Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0xbf800000u), alpha);
4677 alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
4678 } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
4679 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4680 }
4681
4682 return alpha;
4683 }
4684
4685 void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
4686 {
4687 Builder bld(ctx->program, ctx->block);
4688 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4689 if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
4690
4691 nir_instr *off_instr = instr->src[0].ssa->parent_instr;
4692 if (off_instr->type != nir_instr_type_load_const) {
4693 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
4694 nir_print_instr(off_instr, stderr);
4695 fprintf(stderr, "\n");
4696 }
4697 uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
4698
4699 Temp vertex_buffers = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->vertex_buffers));
4700
4701 unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
4702 unsigned component = nir_intrinsic_component(instr);
4703 unsigned bitsize = instr->dest.ssa.bit_size;
4704 unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
4705 uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
4706 uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
4707 unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
4708
4709 unsigned dfmt = attrib_format & 0xf;
4710 unsigned nfmt = (attrib_format >> 4) & 0x7;
4711 const struct ac_data_format_info *vtx_info = ac_get_data_format_info(dfmt);
4712
4713 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
4714 unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
4715 unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
4716 bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
4717 if (post_shuffle)
4718 num_channels = MAX2(num_channels, 3);
4719
4720 Operand off = bld.copy(bld.def(s1), Operand(attrib_binding * 16u));
4721 Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);
4722
4723 Temp index;
4724 if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
4725 uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
4726 Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
4727 if (divisor) {
4728 Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
4729 if (divisor != 1) {
4730 Temp divided = bld.tmp(v1);
4731 emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
4732 index = bld.vadd32(bld.def(v1), start_instance, divided);
4733 } else {
4734 index = bld.vadd32(bld.def(v1), start_instance, instance_id);
4735 }
4736 } else {
4737 index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), start_instance);
4738 }
4739 } else {
4740 index = bld.vadd32(bld.def(v1),
4741 get_arg(ctx, ctx->args->ac.base_vertex),
4742 get_arg(ctx, ctx->args->ac.vertex_id));
4743 }
4744
4745 Temp channels[num_channels];
4746 unsigned channel_start = 0;
4747 bool direct_fetch = false;
4748
4749 /* skip unused channels at the start */
4750 if (vtx_info->chan_byte_size && !post_shuffle) {
4751 channel_start = ffs(mask) - 1;
4752 for (unsigned i = 0; i < channel_start; i++)
4753 channels[i] = Temp(0, s1);
4754 } else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) {
4755 num_channels = 3 - (ffs(mask) - 1);
4756 }
4757
4758 /* load channels */
4759 while (channel_start < num_channels) {
4760 unsigned fetch_component = num_channels - channel_start;
4761 unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
4762 bool expanded = false;
4763
4764          /* use MUBUF when possible to avoid potential alignment issues */
4765 /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */
4766 bool use_mubuf = (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT ||
4767 nfmt == V_008F0C_BUF_NUM_FORMAT_UINT ||
4768 nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) &&
4769 vtx_info->chan_byte_size == 4;
4770 unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
4771 if (!use_mubuf) {
4772 fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_component);
4773 } else {
4774 if (fetch_component == 3 && ctx->options->chip_class == GFX6) {
4775 /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
4776 fetch_component = 4;
4777 expanded = true;
4778 }
4779 }
4780
4781 unsigned fetch_bytes = fetch_component * bitsize / 8;
4782
4783 Temp fetch_index = index;
4784 if (attrib_stride != 0 && fetch_offset > attrib_stride) {
4785 fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index);
4786 fetch_offset = fetch_offset % attrib_stride;
4787 }
4788
4789 Operand soffset(0u);
4790 if (fetch_offset >= 4096) {
4791 soffset = bld.copy(bld.def(s1), Operand(fetch_offset / 4096 * 4096));
4792 fetch_offset %= 4096;
4793 }
4794
4795 aco_opcode opcode;
4796 switch (fetch_bytes) {
4797 case 2:
4798 assert(!use_mubuf && bitsize == 16);
4799 opcode = aco_opcode::tbuffer_load_format_d16_x;
4800 break;
4801 case 4:
4802 if (bitsize == 16) {
4803 assert(!use_mubuf);
4804 opcode = aco_opcode::tbuffer_load_format_d16_xy;
4805 } else {
4806 opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
4807 }
4808 break;
4809 case 6:
4810 assert(!use_mubuf && bitsize == 16);
4811 opcode = aco_opcode::tbuffer_load_format_d16_xyz;
4812 break;
4813 case 8:
4814 if (bitsize == 16) {
4815 assert(!use_mubuf);
4816 opcode = aco_opcode::tbuffer_load_format_d16_xyzw;
4817 } else {
4818 opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
4819 }
4820 break;
4821 case 12:
4822 assert(ctx->options->chip_class >= GFX7 ||
4823 (!use_mubuf && ctx->options->chip_class == GFX6));
4824 opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
4825 break;
4826 case 16:
4827 opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
4828 break;
4829 default:
4830 unreachable("Unimplemented load_input vector size");
4831 }
4832
4833 Temp fetch_dst;
4834 if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle &&
4835 !expanded && (alpha_adjust == RADV_ALPHA_ADJUST_NONE ||
4836 num_channels <= 3)) {
4837 direct_fetch = true;
4838 fetch_dst = dst;
4839 } else {
4840 fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));
4841 }
4842
4843 if (use_mubuf) {
4844 Instruction *mubuf = bld.mubuf(opcode,
4845 Definition(fetch_dst), list, fetch_index, soffset,
4846 fetch_offset, false, false, true).instr;
4847 static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
4848 } else {
4849 Instruction *mtbuf = bld.mtbuf(opcode,
4850 Definition(fetch_dst), list, fetch_index, soffset,
4851 fetch_dfmt, nfmt, fetch_offset, false, true).instr;
4852 static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
4853 }
4854
4855 emit_split_vector(ctx, fetch_dst, fetch_dst.size());
4856
4857 if (fetch_component == 1) {
4858 channels[channel_start] = fetch_dst;
4859 } else {
4860 for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
4861 channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i,
4862 bitsize == 16 ? v2b : v1);
4863 }
4864
4865 channel_start += fetch_component;
4866 }
4867
4868 if (!direct_fetch) {
4869 bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
4870 nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
4871
4872 static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
4873 static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
4874 const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
4875
4876 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
4877 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
4878 unsigned num_temp = 0;
4879 for (unsigned i = 0; i < dst.size(); i++) {
4880 unsigned idx = i + component;
4881 if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) {
4882 Temp channel = channels[swizzle[idx]];
4883 if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE)
4884 channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);
4885 vec->operands[i] = Operand(channel);
4886
4887 num_temp++;
4888 elems[i] = channel;
4889 } else if (is_float && idx == 3) {
4890 vec->operands[i] = Operand(0x3f800000u);
4891 } else if (!is_float && idx == 3) {
4892 vec->operands[i] = Operand(1u);
4893 } else {
4894 vec->operands[i] = Operand(0u);
4895 }
4896 }
4897 vec->definitions[0] = Definition(dst);
4898 ctx->block->instructions.emplace_back(std::move(vec));
4899 emit_split_vector(ctx, dst, dst.size());
4900
4901 if (num_temp == dst.size())
4902 ctx->allocated_vec.emplace(dst.id(), elems);
4903 }
4904 } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
4905 unsigned offset_idx = instr->intrinsic == nir_intrinsic_load_input ? 0 : 1;
4906 nir_instr *off_instr = instr->src[offset_idx].ssa->parent_instr;
4907 if (off_instr->type != nir_instr_type_load_const ||
4908 nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
4909 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
4910 nir_print_instr(off_instr, stderr);
4911 fprintf(stderr, "\n");
4912 }
4913
4914 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
4915 nir_const_value* offset = nir_src_as_const_value(instr->src[offset_idx]);
4916 if (offset) {
4917 assert(offset->u32 == 0);
4918 } else {
4919          /* the lower 15 bits of the prim_mask contain the offset into LDS,
4920           * while the upper bits contain the number of prims */
4921 Temp offset_src = get_ssa_temp(ctx, instr->src[offset_idx].ssa);
4922 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
4923 Builder bld(ctx->program, ctx->block);
4924 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
4925 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
4926 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
4927 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
4928 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
4929 }
4930
4931 unsigned idx = nir_intrinsic_base(instr);
4932 unsigned component = nir_intrinsic_component(instr);
4933 unsigned vertex_id = 2; /* P0 */
4934
4935 if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
4936 nir_const_value* src0 = nir_src_as_const_value(instr->src[0]);
4937 switch (src0->u32) {
4938 case 0:
4939 vertex_id = 2; /* P0 */
4940 break;
4941 case 1:
4942 vertex_id = 0; /* P10 */
4943 break;
4944 case 2:
4945 vertex_id = 1; /* P20 */
4946 break;
4947 default:
4948 unreachable("invalid vertex index");
4949 }
4950 }
4951
4952 if (dst.size() == 1) {
4953 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(vertex_id), bld.m0(prim_mask), idx, component);
4954 } else {
4955 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
4956 for (unsigned i = 0; i < dst.size(); i++)
4957 vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(vertex_id), bld.m0(prim_mask), idx, component + i);
4958 vec->definitions[0] = Definition(dst);
4959 bld.insert(std::move(vec));
4960 }
4961
4962 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_EVAL) {
4963 Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
4964 Temp soffset = get_arg(ctx, ctx->args->oc_lds);
4965 std::pair<Temp, unsigned> offs = get_tcs_per_patch_output_vmem_offset(ctx, instr);
4966 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8u;
4967
4968 load_vmem_mubuf(ctx, dst, ring, offs.first, soffset, offs.second, elem_size_bytes, instr->dest.ssa.num_components);
4969 } else {
4970 unreachable("Shader stage not implemented");
4971 }
4972 }
4973
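/* ESGS offset of a GS per-vertex input (LDS offset for merged ES/GS on GFX9+,
 * VMEM offset otherwise). The per-vertex base comes from the gs_vtx_offset
 * arguments; non-constant vertex indices are lowered to a select over all
 * possible vertices. */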
4974 std::pair<Temp, unsigned> get_gs_per_vertex_input_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride = 1u)
4975 {
4976 assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
4977
4978 Builder bld(ctx->program, ctx->block);
4979 nir_src *vertex_src = nir_get_io_vertex_index_src(instr);
4980 Temp vertex_offset;
4981
4982 if (!nir_src_is_const(*vertex_src)) {
4983 /* better code could be created, but this case probably doesn't happen
4984 * much in practice */
4985 Temp indirect_vertex = as_vgpr(ctx, get_ssa_temp(ctx, vertex_src->ssa));
4986 for (unsigned i = 0; i < ctx->shader->info.gs.vertices_in; i++) {
4987 Temp elem;
4988
4989 if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
4990 elem = get_arg(ctx, ctx->args->gs_vtx_offset[i / 2u * 2u]);
4991 if (i % 2u)
4992 elem = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), elem);
4993 } else {
4994 elem = get_arg(ctx, ctx->args->gs_vtx_offset[i]);
4995 }
4996
4997 if (vertex_offset.id()) {
4998 Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)),
4999 Operand(i), indirect_vertex);
5000 vertex_offset = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), vertex_offset, elem, cond);
5001 } else {
5002 vertex_offset = elem;
5003 }
5004 }
5005
5006 if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs)
5007 vertex_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu), vertex_offset);
5008 } else {
5009 unsigned vertex = nir_src_as_uint(*vertex_src);
5010 if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs)
5011 vertex_offset = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
5012 get_arg(ctx, ctx->args->gs_vtx_offset[vertex / 2u * 2u]),
5013 Operand((vertex % 2u) * 16u), Operand(16u));
5014 else
5015 vertex_offset = get_arg(ctx, ctx->args->gs_vtx_offset[vertex]);
5016 }
5017
5018 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, base_stride);
5019 offs = offset_add(ctx, offs, std::make_pair(vertex_offset, 0u));
5020 return offset_mul(ctx, offs, 4u);
5021 }
5022
5023 void visit_load_gs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
5024 {
5025 assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
5026
5027 Builder bld(ctx->program, ctx->block);
5028 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5029 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
5030
5031 if (ctx->stage == geometry_gs) {
5032 std::pair<Temp, unsigned> offs = get_gs_per_vertex_input_offset(ctx, instr, ctx->program->wave_size);
5033 Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_GS * 16u));
5034 load_vmem_mubuf(ctx, dst, ring, offs.first, Temp(), offs.second, elem_size_bytes, instr->dest.ssa.num_components, 4u * ctx->program->wave_size, false, true);
5035 } else if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
5036 std::pair<Temp, unsigned> offs = get_gs_per_vertex_input_offset(ctx, instr);
5037 unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
5038 load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align);
5039 } else {
5040 unreachable("Unsupported GS stage.");
5041 }
5042 }
5043
5044 void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
5045 {
5046 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5047
5048 Builder bld(ctx->program, ctx->block);
5049 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5050
5051 if (load_input_from_temps(ctx, instr, dst))
5052 return;
5053
5054 std::pair<Temp, unsigned> offs = get_tcs_per_vertex_input_lds_offset(ctx, instr);
5055 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
5056 unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
5057
5058 load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align);
5059 }
5060
5061 void visit_load_tes_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
5062 {
5063 assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5064
5065 Builder bld(ctx->program, ctx->block);
5066
5067 Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
5068 Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
5069 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5070
5071 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
5072 std::pair<Temp, unsigned> offs = get_tcs_per_vertex_output_vmem_offset(ctx, instr);
5073
5074 load_vmem_mubuf(ctx, dst, ring, offs.first, oc_lds, offs.second, elem_size_bytes, instr->dest.ssa.num_components, 0u, true, true);
5075 }
5076
5077 void visit_load_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
5078 {
5079 switch (ctx->shader->info.stage) {
5080 case MESA_SHADER_GEOMETRY:
5081 visit_load_gs_per_vertex_input(ctx, instr);
5082 break;
5083 case MESA_SHADER_TESS_CTRL:
5084 visit_load_tcs_per_vertex_input(ctx, instr);
5085 break;
5086 case MESA_SHADER_TESS_EVAL:
5087 visit_load_tes_per_vertex_input(ctx, instr);
5088 break;
5089 default:
5090 unreachable("Unimplemented shader stage");
5091 }
5092 }
5093
5094 void visit_load_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr)
5095 {
5096 visit_load_tcs_output(ctx, instr, true);
5097 }
5098
5099 void visit_store_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr)
5100 {
5101 assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
5102 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5103
5104 visit_store_tcs_output(ctx, instr, true);
5105 }
5106
5107 void visit_load_tess_coord(isel_context *ctx, nir_intrinsic_instr *instr)
5108 {
5109 assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5110
5111 Builder bld(ctx->program, ctx->block);
5112 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5113
5114 Operand tes_u(get_arg(ctx, ctx->args->tes_u));
5115 Operand tes_v(get_arg(ctx, ctx->args->tes_v));
5116 Operand tes_w(0u);
5117
5118 if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) {
5119 Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5120 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0x3f800000u /* 1.0f */), tmp);
5121 tes_w = Operand(tmp);
5122 }
5123
5124 Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5125 emit_split_vector(ctx, tess_coord, 3);
5126 }
5127
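/* Returns a pointer to the given descriptor set: loaded from the indirect
 * descriptor-set buffer when needed, otherwise taken directly from a user SGPR. */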
5128 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
5129 {
5130 if (ctx->program->info->need_indirect_descriptor_sets) {
5131 Builder bld(ctx->program, ctx->block);
5132 Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
5133 Operand off = bld.copy(bld.def(s1), Operand(desc_set << 2));
5134       return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off);
5135 }
5136
5137 return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
5138 }
5139
5140
5141 void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
5142 {
5143 Builder bld(ctx->program, ctx->block);
5144 Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
5145 if (!nir_dest_is_divergent(instr->dest))
5146 index = bld.as_uniform(index);
5147 unsigned desc_set = nir_intrinsic_desc_set(instr);
5148 unsigned binding = nir_intrinsic_binding(instr);
5149
5150 Temp desc_ptr;
5151 radv_pipeline_layout *pipeline_layout = ctx->options->layout;
5152 radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
5153 unsigned offset = layout->binding[binding].offset;
5154 unsigned stride;
5155 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
5156 layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
5157 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
5158 desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
5159 offset = pipeline_layout->push_constant_size + 16 * idx;
5160 stride = 16;
5161 } else {
5162 desc_ptr = load_desc_ptr(ctx, desc_set);
5163 stride = layout->binding[binding].size;
5164 }
5165
5166 nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
5167 unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
5168 if (stride != 1) {
5169 if (nir_const_index) {
5170 const_index = const_index * stride;
5171 } else if (index.type() == RegType::vgpr) {
5172 bool index24bit = layout->binding[binding].array_size <= 0x1000000;
5173 index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
5174 } else {
5175 index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
5176 }
5177 }
5178 if (offset) {
5179 if (nir_const_index) {
5180 const_index = const_index + offset;
5181 } else if (index.type() == RegType::vgpr) {
5182 index = bld.vadd32(bld.def(v1), Operand(offset), index);
5183 } else {
5184 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
5185 }
5186 }
5187
5188 if (nir_const_index && const_index == 0) {
5189 index = desc_ptr;
5190 } else if (index.type() == RegType::vgpr) {
5191 index = bld.vadd32(bld.def(v1),
5192 nir_const_index ? Operand(const_index) : Operand(index),
5193 Operand(desc_ptr));
5194 } else {
5195 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5196 nir_const_index ? Operand(const_index) : Operand(index),
5197 Operand(desc_ptr));
5198 }
5199
5200 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
5201 }
5202
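/* Loads from a buffer resource, using SMEM when the result is uniform and SMEM
 * is allowed (and either glc is not requested or the chip is GFX8+), otherwise MUBUF. */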
5203 void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
5204 Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
5205 bool glc=false, bool readonly=true, bool allow_smem=true)
5206 {
5207 Builder bld(ctx->program, ctx->block);
5208
5209 bool use_smem = dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem;
5210 if (use_smem)
5211 offset = bld.as_uniform(offset);
5212
5213 LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5214 info.glc = glc;
5215 info.barrier = readonly ? barrier_none : barrier_buffer;
5216 info.can_reorder = readonly;
5217 info.align_mul = align_mul;
5218 info.align_offset = align_offset;
5219 if (use_smem)
5220 emit_smem_load(ctx, bld, &info);
5221 else
5222 emit_mubuf_load(ctx, bld, &info);
5223 }
5224
5225 void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
5226 {
5227 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5228 Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
5229
5230 Builder bld(ctx->program, ctx->block);
5231
5232 nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
5233 unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
5234 unsigned binding = nir_intrinsic_binding(idx_instr);
5235 radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
5236
5237 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
5238 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
5239 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5240 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
5241 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5242 if (ctx->options->chip_class >= GFX10) {
5243 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
5244 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
5245 S_008F0C_RESOURCE_LEVEL(1);
5246 } else {
5247 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5248 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5249 }
5250 Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
5251 Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
5252 Operand(0xFFFFFFFFu),
5253 Operand(desc_type));
5254 rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5255 rsrc, upper_dwords);
5256 } else {
5257 rsrc = convert_pointer_to_64_bit(ctx, rsrc);
5258 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
5259 }
5260 unsigned size = instr->dest.ssa.bit_size / 8;
5261 load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5262 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
5263 }
5264
5265 void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
5266 {
5267 Builder bld(ctx->program, ctx->block);
5268 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5269 unsigned offset = nir_intrinsic_base(instr);
5270 unsigned count = instr->dest.ssa.num_components;
5271 nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
5272
5273 if (index_cv && instr->dest.ssa.bit_size == 32) {
5274 unsigned start = (offset + index_cv->u32) / 4u;
5275 start -= ctx->args->ac.base_inline_push_consts;
5276 if (start + count <= ctx->args->ac.num_inline_push_consts) {
5277 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
5278 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5279 for (unsigned i = 0; i < count; ++i) {
5280 elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
5281 vec->operands[i] = Operand{elems[i]};
5282 }
5283 vec->definitions[0] = Definition(dst);
5284 ctx->block->instructions.emplace_back(std::move(vec));
5285 ctx->allocated_vec.emplace(dst.id(), elems);
5286 return;
5287 }
5288 }
5289
5290 Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5291 if (offset != 0) // TODO check if index != 0 as well
5292 index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
5293 Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
5294 Temp vec = dst;
5295 bool trim = false;
5296 bool aligned = true;
5297
5298 if (instr->dest.ssa.bit_size == 8) {
5299 aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5300 bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
5301 if (!aligned)
5302 vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
5303 } else if (instr->dest.ssa.bit_size == 16) {
5304 aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5305 if (!aligned)
5306 vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
5307 }
5308
5309 aco_opcode op;
5310
5311 switch (vec.size()) {
5312 case 1:
5313 op = aco_opcode::s_load_dword;
5314 break;
5315 case 2:
5316 op = aco_opcode::s_load_dwordx2;
5317 break;
5318 case 3:
5319 vec = bld.tmp(s4);
5320 trim = true;
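      /* fallthrough */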
5321 case 4:
5322 op = aco_opcode::s_load_dwordx4;
5323 break;
5324 case 6:
5325 vec = bld.tmp(s8);
5326 trim = true;
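      /* fallthrough */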
5327 case 8:
5328 op = aco_opcode::s_load_dwordx8;
5329 break;
5330 default:
5331 unreachable("unimplemented or forbidden load_push_constant.");
5332 }
5333
5334 static_cast<SMEM_instruction*>(bld.smem(op, Definition(vec), ptr, index).instr)->prevent_overflow = true;
5335
5336 if (!aligned) {
5337 Operand byte_offset = index_cv ? Operand((offset + index_cv->u32) % 4) : Operand(index);
5338 byte_align_scalar(ctx, vec, byte_offset, dst);
5339 return;
5340 }
5341
5342 if (trim) {
5343 emit_split_vector(ctx, vec, 4);
5344 RegClass rc = dst.size() == 3 ? s1 : s2;
5345 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5346 emit_extract_vector(ctx, vec, 0, rc),
5347 emit_extract_vector(ctx, vec, 1, rc),
5348 emit_extract_vector(ctx, vec, 2, rc));
5349
5350 }
5351 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
5352 }
5353
5354 void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
5355 {
5356 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5357
5358 Builder bld(ctx->program, ctx->block);
5359
5360 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
5361 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5362 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
5363 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5364 if (ctx->options->chip_class >= GFX10) {
5365 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
5366 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
5367 S_008F0C_RESOURCE_LEVEL(1);
5368 } else {
5369 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5370 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5371 }
5372
5373 unsigned base = nir_intrinsic_base(instr);
5374 unsigned range = nir_intrinsic_range(instr);
5375
5376 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5377 if (base && offset.type() == RegType::sgpr)
5378 offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
5379 else if (base && offset.type() == RegType::vgpr)
5380 offset = bld.vadd32(bld.def(v1), Operand(base), offset);
5381
5382 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5383 bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
5384 Operand(MIN2(base + range, ctx->shader->constant_data_size)),
5385 Operand(desc_type));
5386 unsigned size = instr->dest.ssa.bit_size / 8;
5387 // TODO: get alignment information for subdword constants
5388 load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5389 }
5390
5391 void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
5392 {
5393 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5394 ctx->cf_info.exec_potentially_empty_discard = true;
5395
5396 ctx->program->needs_exact = true;
5397
5398 // TODO: optimize uniform conditions
5399 Builder bld(ctx->program, ctx->block);
5400 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5401 assert(src.regClass() == bld.lm);
5402 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
5403 bld.pseudo(aco_opcode::p_discard_if, src);
5404 ctx->block->kind |= block_kind_uses_discard_if;
5405 return;
5406 }
5407
5408 void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
5409 {
5410 Builder bld(ctx->program, ctx->block);
5411
5412 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5413 ctx->cf_info.exec_potentially_empty_discard = true;
5414
5415 bool divergent = ctx->cf_info.parent_if.is_divergent ||
5416 ctx->cf_info.parent_loop.has_divergent_continue;
5417
5418 if (ctx->block->loop_nest_depth &&
5419 ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
5420 /* we handle discards the same way as jump instructions */
5421 append_logical_end(ctx->block);
5422
5423 /* in loops, discard behaves like break */
5424 Block *linear_target = ctx->cf_info.parent_loop.exit;
5425 ctx->block->kind |= block_kind_discard;
5426
5427 if (!divergent) {
5428 /* uniform discard - loop ends here */
5429 assert(nir_instr_is_last(&instr->instr));
5430 ctx->block->kind |= block_kind_uniform;
5431 ctx->cf_info.has_branch = true;
5432 bld.branch(aco_opcode::p_branch);
5433 add_linear_edge(ctx->block->index, linear_target);
5434 return;
5435 }
5436
5437    /* we add a break right behind the discard() instruction */
5438 ctx->block->kind |= block_kind_break;
5439 unsigned idx = ctx->block->index;
5440
5441 ctx->cf_info.parent_loop.has_divergent_branch = true;
5442 ctx->cf_info.nir_to_aco[instr->instr.block->index] = idx;
5443
5444 /* remove critical edges from linear CFG */
5445 bld.branch(aco_opcode::p_branch);
5446 Block* break_block = ctx->program->create_and_insert_block();
5447 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
5448 break_block->kind |= block_kind_uniform;
5449 add_linear_edge(idx, break_block);
5450 add_linear_edge(break_block->index, linear_target);
5451 bld.reset(break_block);
5452 bld.branch(aco_opcode::p_branch);
5453
5454 Block* continue_block = ctx->program->create_and_insert_block();
5455 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
5456 add_linear_edge(idx, continue_block);
5457 append_logical_start(continue_block);
5458 ctx->block = continue_block;
5459
5460 return;
5461 }
5462
5463 /* it can currently happen that NIR doesn't remove the unreachable code */
5464 if (!nir_instr_is_last(&instr->instr)) {
5465 ctx->program->needs_exact = true;
5466       /* save exec somewhere temporarily so that it doesn't get
5467        * overwritten by the outer exec masks before the discard */
5468 Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, bld.lm));
5469 bld.pseudo(aco_opcode::p_discard_if, cond);
5470 ctx->block->kind |= block_kind_uses_discard_if;
5471 return;
5472 }
5473
5474 /* This condition is incorrect for uniformly branched discards in a loop
5475 * predicated by a divergent condition, but the above code catches that case
5476 * and the discard would end up turning into a discard_if.
5477 * For example:
5478 * if (divergent) {
5479 * while (...) {
5480 * if (uniform) {
5481 * discard;
5482 * }
5483 * }
5484 * }
5485 */
5486 if (!ctx->cf_info.parent_if.is_divergent) {
5487 /* program just ends here */
5488 ctx->block->kind |= block_kind_uniform;
5489 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
5490 0 /* enabled mask */, 9 /* dest */,
5491 false /* compressed */, true/* done */, true /* valid mask */);
5492 bld.sopp(aco_opcode::s_endpgm);
5493 // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
5494 } else {
5495 ctx->block->kind |= block_kind_discard;
5496 /* branch and linear edge is added by visit_if() */
5497 }
5498 }
5499
5500 enum aco_descriptor_type {
5501 ACO_DESC_IMAGE,
5502 ACO_DESC_FMASK,
5503 ACO_DESC_SAMPLER,
5504 ACO_DESC_BUFFER,
5505 ACO_DESC_PLANE_0,
5506 ACO_DESC_PLANE_1,
5507 ACO_DESC_PLANE_2,
5508 };
5509
5510 static bool
5511 should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
5512 if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
5513 return false;
5514 ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
5515 return dim == ac_image_cube ||
5516 dim == ac_image_1darray ||
5517 dim == ac_image_2darray ||
5518 dim == ac_image_2darraymsaa;
5519 }
5520
5521 Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
5522 enum aco_descriptor_type desc_type,
5523 const nir_tex_instr *tex_instr, bool image, bool write)
5524 {
5525 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
5526 std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
5527 if (it != ctx->tex_desc.end())
5528 return it->second;
5529 */
5530 Temp index = Temp();
5531 bool index_set = false;
5532 unsigned constant_index = 0;
5533 unsigned descriptor_set;
5534 unsigned base_index;
5535 Builder bld(ctx->program, ctx->block);
5536
5537 if (!deref_instr) {
5538 assert(tex_instr && !image);
5539 descriptor_set = 0;
5540 base_index = tex_instr->sampler_index;
5541 } else {
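           /* walk the deref chain up to the variable: constant array indices are
            * folded into constant_index, dynamic ones are read as uniform and
            * accumulated into index */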
5542 while(deref_instr->deref_type != nir_deref_type_var) {
5543 unsigned array_size = glsl_get_aoa_size(deref_instr->type);
5544 if (!array_size)
5545 array_size = 1;
5546
5547 assert(deref_instr->deref_type == nir_deref_type_array);
5548 nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
5549 if (const_value) {
5550 constant_index += array_size * const_value->u32;
5551 } else {
5552 Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
5553 if (indirect.type() == RegType::vgpr)
5554 indirect = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), indirect);
5555
5556 if (array_size != 1)
5557 indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
5558
5559 if (!index_set) {
5560 index = indirect;
5561 index_set = true;
5562 } else {
5563 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
5564 }
5565 }
5566
5567 deref_instr = nir_src_as_deref(deref_instr->parent);
5568 }
5569 descriptor_set = deref_instr->var->data.descriptor_set;
5570 base_index = deref_instr->var->data.binding;
5571 }
5572
5573 Temp list = load_desc_ptr(ctx, descriptor_set);
5574 list = convert_pointer_to_64_bit(ctx, list);
5575
5576 struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
5577 struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
5578 unsigned offset = binding->offset;
5579 unsigned stride = binding->size;
5580 aco_opcode opcode;
5581 RegClass type;
5582
5583 assert(base_index < layout->binding_count);
5584
5585 switch (desc_type) {
5586 case ACO_DESC_IMAGE:
5587 type = s8;
5588 opcode = aco_opcode::s_load_dwordx8;
5589 break;
5590 case ACO_DESC_FMASK:
5591 type = s8;
5592 opcode = aco_opcode::s_load_dwordx8;
5593 offset += 32;
5594 break;
5595 case ACO_DESC_SAMPLER:
5596 type = s4;
5597 opcode = aco_opcode::s_load_dwordx4;
5598 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
5599 offset += radv_combined_image_descriptor_sampler_offset(binding);
5600 break;
5601 case ACO_DESC_BUFFER:
5602 type = s4;
5603 opcode = aco_opcode::s_load_dwordx4;
5604 break;
5605 case ACO_DESC_PLANE_0:
5606 case ACO_DESC_PLANE_1:
5607 type = s8;
5608 opcode = aco_opcode::s_load_dwordx8;
5609 offset += 32 * (desc_type - ACO_DESC_PLANE_0);
5610 break;
5611 case ACO_DESC_PLANE_2:
5612 type = s4;
5613 opcode = aco_opcode::s_load_dwordx4;
5614 offset += 64;
5615 break;
5616 default:
5617 unreachable("invalid desc_type\n");
5618 }
5619
5620 offset += constant_index * stride;
5621
5622 if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
5623 (!index_set || binding->immutable_samplers_equal)) {
5624 if (binding->immutable_samplers_equal)
5625 constant_index = 0;
5626
5627 const uint32_t *samplers = radv_immutable_samplers(layout, binding);
5628 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5629 Operand(samplers[constant_index * 4 + 0]),
5630 Operand(samplers[constant_index * 4 + 1]),
5631 Operand(samplers[constant_index * 4 + 2]),
5632 Operand(samplers[constant_index * 4 + 3]));
5633 }
5634
5635 Operand off;
5636 if (!index_set) {
5637 off = bld.copy(bld.def(s1), Operand(offset));
5638 } else {
5639 off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
5640 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
5641 }
5642
5643 Temp res = bld.smem(opcode, bld.def(type), list, off);
5644
5645 if (desc_type == ACO_DESC_PLANE_2) {
5646 Temp components[8];
5647 for (unsigned i = 0; i < 8; i++)
5648 components[i] = bld.tmp(s1);
5649 bld.pseudo(aco_opcode::p_split_vector,
5650 Definition(components[0]),
5651 Definition(components[1]),
5652 Definition(components[2]),
5653 Definition(components[3]),
5654 res);
5655
5656 Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
5657 bld.pseudo(aco_opcode::p_split_vector,
5658 bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
5659 Definition(components[4]),
5660 Definition(components[5]),
5661 Definition(components[6]),
5662 Definition(components[7]),
5663 desc2);
5664
5665 res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
5666 components[0], components[1], components[2], components[3],
5667 components[4], components[5], components[6], components[7]);
5668 }
5669
5670 return res;
5671 }
5672
5673 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
5674 {
5675 switch (dim) {
5676 case GLSL_SAMPLER_DIM_BUF:
5677 return 1;
5678 case GLSL_SAMPLER_DIM_1D:
5679 return array ? 2 : 1;
5680 case GLSL_SAMPLER_DIM_2D:
5681 return array ? 3 : 2;
5682 case GLSL_SAMPLER_DIM_MS:
5683 return array ? 4 : 3;
5684 case GLSL_SAMPLER_DIM_3D:
5685 case GLSL_SAMPLER_DIM_CUBE:
5686 return 3;
5687 case GLSL_SAMPLER_DIM_RECT:
5688 case GLSL_SAMPLER_DIM_SUBPASS:
5689 return 2;
5690 case GLSL_SAMPLER_DIM_SUBPASS_MS:
5691 return 3;
5692 default:
5693 break;
5694 }
5695 return 0;
5696 }
5697
5698
5699 /* Adjust the sample index according to FMASK.
5700 *
5701 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
5702 * which is the identity mapping. Each nibble says which physical sample
5703 * should be fetched to get that sample.
5704 *
5705 * For example, 0x11111100 means there are only 2 samples stored and
5706 * the second sample covers 3/4 of the pixel. When reading samples 0
5707 * and 1, return physical sample 0 (determined by the first two 0s
5708 * in FMASK), otherwise return physical sample 1.
5709 *
5710 * The sample index should be adjusted as follows:
5711 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
5712 */
5713 static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vector<Temp>& coords, Operand sample_index, Temp fmask_desc_ptr)
5714 {
5715 Builder bld(ctx->program, ctx->block);
5716 Temp fmask = bld.tmp(v1);
5717 unsigned dim = ctx->options->chip_class >= GFX10
5718 ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
5719 : 0;
5720
5721 Temp coord = da ? bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), coords[0], coords[1], coords[2]) :
5722 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), coords[0], coords[1]);
5723 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 3, 1)};
5724 load->operands[0] = Operand(fmask_desc_ptr);
5725 load->operands[1] = Operand(s4); /* no sampler */
5726 load->operands[2] = Operand(coord);
5727 load->definitions[0] = Definition(fmask);
5728 load->glc = false;
5729 load->dlc = false;
5730 load->dmask = 0x1;
5731 load->unrm = true;
5732 load->da = da;
5733 load->dim = dim;
5734 load->can_reorder = true; /* fmask images shouldn't be modified */
5735 ctx->block->instructions.emplace_back(std::move(load));
5736
5737 Operand sample_index4;
5738 if (sample_index.isConstant()) {
5739 if (sample_index.constantValue() < 16) {
5740 sample_index4 = Operand(sample_index.constantValue() << 2);
5741 } else {
5742 sample_index4 = Operand(0u);
5743 }
5744 } else if (sample_index.regClass() == s1) {
5745 sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
5746 } else {
5747 assert(sample_index.regClass() == v1);
5748 sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
5749 }
5750
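        /* extract the nibble (fmask >> (sample_index * 4)) & 0xF; the first and
         * last nibble can use a plain AND or shift instead of v_bfe */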
5751 Temp final_sample;
5752 if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
5753 final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
5754 else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
5755 final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
5756 else
5757 final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
5758
5759 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
5760     * resource descriptor is 0 (invalid).
5761 */
5762 Temp compare = bld.tmp(bld.lm);
5763 bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
5764 Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
5765
5766 Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
5767
5768 /* Replace the MSAA sample index. */
5769 return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
5770 }
5771
5772 static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
5773 {
5774
5775 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
5776 enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5777 bool is_array = glsl_sampler_type_is_array(type);
5778 ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
5779 assert(!add_frag_pos && "Input attachments should be lowered.");
5780 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
5781 bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
5782 int count = image_type_to_components_count(dim, is_array);
5783 std::vector<Temp> coords(count);
5784 Builder bld(ctx->program, ctx->block);
5785
5786 if (is_ms) {
5787 count--;
5788 Temp src2 = get_ssa_temp(ctx, instr->src[2].ssa);
5789 /* get sample index */
5790 if (instr->intrinsic == nir_intrinsic_image_deref_load) {
5791 nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
5792 Operand sample_index = sample_cv ? Operand(sample_cv->u32) : Operand(emit_extract_vector(ctx, src2, 0, v1));
5793 std::vector<Temp> fmask_load_address;
5794 for (unsigned i = 0; i < (is_array ? 3 : 2); i++)
5795 fmask_load_address.emplace_back(emit_extract_vector(ctx, src0, i, v1));
5796
5797 Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
5798 coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr);
5799 } else {
5800 coords[count] = emit_extract_vector(ctx, src2, 0, v1);
5801 }
5802 }
5803
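        /* GFX9 addresses 1D images as 2D: insert a zero Y coordinate and move
         * the array layer to the third component */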
5804 if (gfx9_1d) {
5805 coords[0] = emit_extract_vector(ctx, src0, 0, v1);
5806 coords.resize(coords.size() + 1);
5807 coords[1] = bld.copy(bld.def(v1), Operand(0u));
5808 if (is_array)
5809 coords[2] = emit_extract_vector(ctx, src0, 1, v1);
5810 } else {
5811 for (int i = 0; i < count; i++)
5812 coords[i] = emit_extract_vector(ctx, src0, i, v1);
5813 }
5814
5815 if (instr->intrinsic == nir_intrinsic_image_deref_load ||
5816 instr->intrinsic == nir_intrinsic_image_deref_store) {
5817 int lod_index = instr->intrinsic == nir_intrinsic_image_deref_load ? 3 : 4;
5818 bool level_zero = nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
5819
5820 if (!level_zero)
5821 coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa));
5822 }
5823
5824 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
5825 for (unsigned i = 0; i < coords.size(); i++)
5826 vec->operands[i] = Operand(coords[i]);
5827 Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
5828 vec->definitions[0] = Definition(res);
5829 ctx->block->instructions.emplace_back(std::move(vec));
5830 return res;
5831 }
5832
5833
5834 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
5835 {
5836 Builder bld(ctx->program, ctx->block);
5837 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
5838 const struct glsl_type *type = glsl_without_array(var->type);
5839 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5840 bool is_array = glsl_sampler_type_is_array(type);
5841 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5842
5843 if (dim == GLSL_SAMPLER_DIM_BUF) {
5844 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
5845 unsigned num_channels = util_last_bit(mask);
5846 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
5847 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
5848
5849 aco_opcode opcode;
5850 switch (num_channels) {
5851 case 1:
5852 opcode = aco_opcode::buffer_load_format_x;
5853 break;
5854 case 2:
5855 opcode = aco_opcode::buffer_load_format_xy;
5856 break;
5857 case 3:
5858 opcode = aco_opcode::buffer_load_format_xyz;
5859 break;
5860 case 4:
5861 opcode = aco_opcode::buffer_load_format_xyzw;
5862 break;
5863 default:
5864 unreachable(">4 channel buffer image load");
5865 }
5866 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
5867 load->operands[0] = Operand(rsrc);
5868 load->operands[1] = Operand(vindex);
5869 load->operands[2] = Operand((uint32_t) 0);
5870 Temp tmp;
5871 if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
5872 tmp = dst;
5873 else
5874 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
5875 load->definitions[0] = Definition(tmp);
5876 load->idxen = true;
5877 load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
5878 load->dlc = load->glc && ctx->options->chip_class >= GFX10;
5879 load->barrier = barrier_image;
5880 ctx->block->instructions.emplace_back(std::move(load));
5881
5882 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
5883 return;
5884 }
5885
5886 Temp coords = get_image_coords(ctx, instr, type);
5887 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
5888
5889 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
5890 unsigned num_components = util_bitcount(dmask);
5891 Temp tmp;
5892 if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
5893 tmp = dst;
5894 else
5895 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
5896
5897 bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
5898 aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
5899
5900 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1)};
5901 load->operands[0] = Operand(resource);
5902 load->operands[1] = Operand(s4); /* no sampler */
5903 load->operands[2] = Operand(coords);
5904 load->definitions[0] = Definition(tmp);
5905 load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
5906 load->dlc = load->glc && ctx->options->chip_class >= GFX10;
5907 load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
5908 load->dmask = dmask;
5909 load->unrm = true;
5910 load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
5911 load->barrier = barrier_image;
5912 ctx->block->instructions.emplace_back(std::move(load));
5913
5914 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
5915 return;
5916 }
5917
5918 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
5919 {
5920 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
5921 const struct glsl_type *type = glsl_without_array(var->type);
5922 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5923 bool is_array = glsl_sampler_type_is_array(type);
5924 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
5925
5926    bool glc = ctx->options->chip_class == GFX6 || (var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE));
5927
5928 if (dim == GLSL_SAMPLER_DIM_BUF) {
5929 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
5930 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
5931 aco_opcode opcode;
5932 switch (data.size()) {
5933 case 1:
5934 opcode = aco_opcode::buffer_store_format_x;
5935 break;
5936 case 2:
5937 opcode = aco_opcode::buffer_store_format_xy;
5938 break;
5939 case 3:
5940 opcode = aco_opcode::buffer_store_format_xyz;
5941 break;
5942 case 4:
5943 opcode = aco_opcode::buffer_store_format_xyzw;
5944 break;
5945 default:
5946 unreachable(">4 channel buffer image store");
5947 }
5948 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
5949 store->operands[0] = Operand(rsrc);
5950 store->operands[1] = Operand(vindex);
5951 store->operands[2] = Operand((uint32_t) 0);
5952 store->operands[3] = Operand(data);
5953 store->idxen = true;
5954 store->glc = glc;
5955 store->dlc = false;
5956 store->disable_wqm = true;
5957 store->barrier = barrier_image;
5958 ctx->program->needs_exact = true;
5959 ctx->block->instructions.emplace_back(std::move(store));
5960 return;
5961 }
5962
5963 assert(data.type() == RegType::vgpr);
5964 Temp coords = get_image_coords(ctx, instr, type);
5965 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
5966
5967 bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
5968 aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
5969
5970 aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 0)};
5971 store->operands[0] = Operand(resource);
5972 store->operands[1] = Operand(data);
5973 store->operands[2] = Operand(coords);
5974 store->glc = glc;
5975 store->dlc = false;
5976 store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
5977 store->dmask = (1 << data.size()) - 1;
5978 store->unrm = true;
5979 store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
5980 store->disable_wqm = true;
5981 store->barrier = barrier_image;
5982 ctx->program->needs_exact = true;
5983 ctx->block->instructions.emplace_back(std::move(store));
5984 return;
5985 }
5986
5987 void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
5988 {
5989 /* return the previous value if dest is ever used */
5990 bool return_previous = false;
5991 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
5992 return_previous = true;
5993 break;
5994 }
5995 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
5996 return_previous = true;
5997 break;
5998 }
5999
6000 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6001 const struct glsl_type *type = glsl_without_array(var->type);
6002 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6003 bool is_array = glsl_sampler_type_is_array(type);
6004 Builder bld(ctx->program, ctx->block);
6005
6006 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6007    assert(data.size() == 1 && "64bit image atomics not yet implemented.");
6008
6009 if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
6010 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
6011
6012 aco_opcode buf_op, image_op;
6013 switch (instr->intrinsic) {
6014 case nir_intrinsic_image_deref_atomic_add:
6015 buf_op = aco_opcode::buffer_atomic_add;
6016 image_op = aco_opcode::image_atomic_add;
6017 break;
6018 case nir_intrinsic_image_deref_atomic_umin:
6019 buf_op = aco_opcode::buffer_atomic_umin;
6020 image_op = aco_opcode::image_atomic_umin;
6021 break;
6022 case nir_intrinsic_image_deref_atomic_imin:
6023 buf_op = aco_opcode::buffer_atomic_smin;
6024 image_op = aco_opcode::image_atomic_smin;
6025 break;
6026 case nir_intrinsic_image_deref_atomic_umax:
6027 buf_op = aco_opcode::buffer_atomic_umax;
6028 image_op = aco_opcode::image_atomic_umax;
6029 break;
6030 case nir_intrinsic_image_deref_atomic_imax:
6031 buf_op = aco_opcode::buffer_atomic_smax;
6032 image_op = aco_opcode::image_atomic_smax;
6033 break;
6034 case nir_intrinsic_image_deref_atomic_and:
6035 buf_op = aco_opcode::buffer_atomic_and;
6036 image_op = aco_opcode::image_atomic_and;
6037 break;
6038 case nir_intrinsic_image_deref_atomic_or:
6039 buf_op = aco_opcode::buffer_atomic_or;
6040 image_op = aco_opcode::image_atomic_or;
6041 break;
6042 case nir_intrinsic_image_deref_atomic_xor:
6043 buf_op = aco_opcode::buffer_atomic_xor;
6044 image_op = aco_opcode::image_atomic_xor;
6045 break;
6046 case nir_intrinsic_image_deref_atomic_exchange:
6047 buf_op = aco_opcode::buffer_atomic_swap;
6048 image_op = aco_opcode::image_atomic_swap;
6049 break;
6050 case nir_intrinsic_image_deref_atomic_comp_swap:
6051 buf_op = aco_opcode::buffer_atomic_cmpswap;
6052 image_op = aco_opcode::image_atomic_cmpswap;
6053 break;
6054 default:
6055 unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
6056 }
6057
6058 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6059
6060 if (dim == GLSL_SAMPLER_DIM_BUF) {
6061 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6062 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
6063 //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
6064 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6065 mubuf->operands[0] = Operand(resource);
6066 mubuf->operands[1] = Operand(vindex);
6067 mubuf->operands[2] = Operand((uint32_t)0);
6068 mubuf->operands[3] = Operand(data);
6069 if (return_previous)
6070 mubuf->definitions[0] = Definition(dst);
6071 mubuf->offset = 0;
6072 mubuf->idxen = true;
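        /* for atomics, GLC=1 makes the instruction return the pre-op value */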
6073 mubuf->glc = return_previous;
6074 mubuf->dlc = false; /* Not needed for atomics */
6075 mubuf->disable_wqm = true;
6076 mubuf->barrier = barrier_image;
6077 ctx->program->needs_exact = true;
6078 ctx->block->instructions.emplace_back(std::move(mubuf));
6079 return;
6080 }
6081
6082 Temp coords = get_image_coords(ctx, instr, type);
6083 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
6084 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 3, return_previous ? 1 : 0)};
6085 mimg->operands[0] = Operand(resource);
6086 mimg->operands[1] = Operand(data);
6087 mimg->operands[2] = Operand(coords);
6088 if (return_previous)
6089 mimg->definitions[0] = Definition(dst);
6090 mimg->glc = return_previous;
6091 mimg->dlc = false; /* Not needed for atomics */
6092 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6093 mimg->dmask = (1 << data.size()) - 1;
6094 mimg->unrm = true;
6095 mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
6096 mimg->disable_wqm = true;
6097 mimg->barrier = barrier_image;
6098 ctx->program->needs_exact = true;
6099 ctx->block->instructions.emplace_back(std::move(mimg));
6100 return;
6101 }
6102
6103 void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
6104 {
6105 if (in_elements && ctx->options->chip_class == GFX8) {
6106 /* we only have to divide by 1, 2, 4, 8, 12 or 16 */
6107 Builder bld(ctx->program, ctx->block);
6108
6109 Temp size = emit_extract_vector(ctx, desc, 2, s1);
6110
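           /* division by 3 via reciprocal multiplication: 0xaaaaaaab ~= 2^33 / 3,
            * so v_mul_hi yields size * 2 / 3 and the shift by one gives size / 3
            * (only needed when the stride is 12 bytes) */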
6111 Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), bld.copy(bld.def(v1), Operand(0xaaaaaaabu)), size);
6112 size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.as_uniform(size_div3), Operand(1u));
6113
6114 Temp stride = emit_extract_vector(ctx, desc, 1, s1);
6115 stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
6116
6117 Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand(12u));
6118 size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));
6119
6120 Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;
6121 bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc),
6122 size, bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));
6123 if (dst.type() == RegType::vgpr)
6124 bld.copy(Definition(dst), shr_dst);
6125
6126 /* TODO: we can probably calculate this faster with v_skip when stride != 12 */
6127 } else {
6128 emit_extract_vector(ctx, desc, 2, dst);
6129 }
6130 }
6131
6132 void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
6133 {
6134 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6135 const struct glsl_type *type = glsl_without_array(var->type);
6136 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6137 bool is_array = glsl_sampler_type_is_array(type);
6138 Builder bld(ctx->program, ctx->block);
6139
6140 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
6141 Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
6142 return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
6143 }
6144
6145 /* LOD */
6146 Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6147
6148 /* Resource */
6149 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
6150
6151 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6152
6153 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)};
6154 mimg->operands[0] = Operand(resource);
6155 mimg->operands[1] = Operand(s4); /* no sampler */
6156 mimg->operands[2] = Operand(lod);
6157 uint8_t& dmask = mimg->dmask;
6158 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6159 mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
6160 mimg->da = glsl_sampler_type_is_array(type);
6161 mimg->can_reorder = true;
6162 Definition& def = mimg->definitions[0];
6163 ctx->block->instructions.emplace_back(std::move(mimg));
6164
6165 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
6166 glsl_sampler_type_is_array(type)) {
6167
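           /* the hardware reports cube array depth in layer-faces (6 per cube),
            * but the query expects the number of cubes, hence the division by 6 below */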
6168 assert(instr->dest.ssa.num_components == 3);
6169 Temp tmp = {ctx->program->allocateId(), v3};
6170 def = Definition(tmp);
6171 emit_split_vector(ctx, tmp, 3);
6172
6173 /* divide 3rd value by 6 by multiplying with magic number */
6174 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
6175 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
6176
6177 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
6178 emit_extract_vector(ctx, tmp, 0, v1),
6179 emit_extract_vector(ctx, tmp, 1, v1),
6180 by_6);
6181
6182 } else if (ctx->options->chip_class == GFX9 &&
6183 glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
6184 glsl_sampler_type_is_array(type)) {
6185 assert(instr->dest.ssa.num_components == 2);
6186 def = Definition(dst);
6187 dmask = 0x5;
6188 } else {
6189 def = Definition(dst);
6190 }
6191
6192 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
6193 }
6194
6195 void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
6196 {
6197 Builder bld(ctx->program, ctx->block);
6198 unsigned num_components = instr->num_components;
6199
6200 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6201 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6202 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
6203
6204 unsigned access = nir_intrinsic_access(instr);
6205 bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6206 unsigned size = instr->dest.ssa.bit_size / 8;
6207
6208 uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[0].ssa, access);
6209 /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
6210 * TODO: this optimization is disabled for now because we still need to ensure correct ordering
6211 */
6212 bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_store : has_vmem_store));
6213 allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
6214
6215 load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6216 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
6217 }
6218
6219 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
6220 {
6221 Builder bld(ctx->program, ctx->block);
6222 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6223 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6224 unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6225 Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6226
6227 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6228 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
6229
6230 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6231 uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
6232 /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
6233 * TODO: this optimization is disabled for now because we still need to ensure correct ordering
6234 */
6235 bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_loadstore : has_vmem_loadstore));
6236
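        /* SMEM stores are only used with a uniform offset on GFX8+, when the data
         * size allows it and no potentially aliasing VMEM access rules it out */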
6237 bool smem = !nir_src_is_divergent(instr->src[2]) &&
6238 ctx->options->chip_class >= GFX8 &&
6239 (elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) &&
6240 allow_smem;
6241 if (smem)
6242 offset = bld.as_uniform(offset);
6243 bool smem_nonfs = smem && ctx->stage != fragment_fs;
6244
6245 unsigned write_count = 0;
6246 Temp write_datas[32];
6247 unsigned offsets[32];
6248 split_buffer_store(ctx, instr, smem, smem_nonfs ? RegType::sgpr : (smem ? data.type() : RegType::vgpr),
6249 data, writemask, 16, &write_count, write_datas, offsets);
6250
6251 for (unsigned i = 0; i < write_count; i++) {
6252 aco_opcode op = get_buffer_store_op(smem, write_datas[i].bytes());
6253 if (smem && ctx->stage == fragment_fs)
6254 op = aco_opcode::p_fs_buffer_store_smem;
6255
6256 if (smem) {
6257 aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
6258 store->operands[0] = Operand(rsrc);
6259 if (offsets[i]) {
6260 Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
6261 offset, Operand(offsets[i]));
6262 store->operands[1] = Operand(off);
6263 } else {
6264 store->operands[1] = Operand(offset);
6265 }
6266 if (op != aco_opcode::p_fs_buffer_store_smem)
6267 store->operands[1].setFixed(m0);
6268 store->operands[2] = Operand(write_datas[i]);
6269 store->glc = glc;
6270 store->dlc = false;
6271 store->disable_wqm = true;
6272 store->barrier = barrier_buffer;
6273 ctx->block->instructions.emplace_back(std::move(store));
6274 ctx->program->wb_smem_l1_on_end = true;
6275 if (op == aco_opcode::p_fs_buffer_store_smem) {
6276 ctx->block->kind |= block_kind_needs_lowering;
6277 ctx->program->needs_exact = true;
6278 }
6279 } else {
6280 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6281 store->operands[0] = Operand(rsrc);
6282 store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6283 store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
6284 store->operands[3] = Operand(write_datas[i]);
6285 store->offset = offsets[i];
6286 store->offen = (offset.type() == RegType::vgpr);
6287 store->glc = glc;
6288 store->dlc = false;
6289 store->disable_wqm = true;
6290 store->barrier = barrier_buffer;
6291 ctx->program->needs_exact = true;
6292 ctx->block->instructions.emplace_back(std::move(store));
6293 }
6294 }
6295 }
6296
6297 void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
6298 {
6299 /* return the previous value if dest is ever used */
6300 bool return_previous = false;
6301 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
6302 return_previous = true;
6303 break;
6304 }
6305 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
6306 return_previous = true;
6307 break;
6308 }
6309
6310 Builder bld(ctx->program, ctx->block);
6311 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6312
6313 if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
6314 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6315 get_ssa_temp(ctx, instr->src[3].ssa), data);
6316
6317 Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6318 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6319 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
6320
6321 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6322
6323 aco_opcode op32, op64;
6324 switch (instr->intrinsic) {
6325 case nir_intrinsic_ssbo_atomic_add:
6326 op32 = aco_opcode::buffer_atomic_add;
6327 op64 = aco_opcode::buffer_atomic_add_x2;
6328 break;
6329 case nir_intrinsic_ssbo_atomic_imin:
6330 op32 = aco_opcode::buffer_atomic_smin;
6331 op64 = aco_opcode::buffer_atomic_smin_x2;
6332 break;
6333 case nir_intrinsic_ssbo_atomic_umin:
6334 op32 = aco_opcode::buffer_atomic_umin;
6335 op64 = aco_opcode::buffer_atomic_umin_x2;
6336 break;
6337 case nir_intrinsic_ssbo_atomic_imax:
6338 op32 = aco_opcode::buffer_atomic_smax;
6339 op64 = aco_opcode::buffer_atomic_smax_x2;
6340 break;
6341 case nir_intrinsic_ssbo_atomic_umax:
6342 op32 = aco_opcode::buffer_atomic_umax;
6343 op64 = aco_opcode::buffer_atomic_umax_x2;
6344 break;
6345 case nir_intrinsic_ssbo_atomic_and:
6346 op32 = aco_opcode::buffer_atomic_and;
6347 op64 = aco_opcode::buffer_atomic_and_x2;
6348 break;
6349 case nir_intrinsic_ssbo_atomic_or:
6350 op32 = aco_opcode::buffer_atomic_or;
6351 op64 = aco_opcode::buffer_atomic_or_x2;
6352 break;
6353 case nir_intrinsic_ssbo_atomic_xor:
6354 op32 = aco_opcode::buffer_atomic_xor;
6355 op64 = aco_opcode::buffer_atomic_xor_x2;
6356 break;
6357 case nir_intrinsic_ssbo_atomic_exchange:
6358 op32 = aco_opcode::buffer_atomic_swap;
6359 op64 = aco_opcode::buffer_atomic_swap_x2;
6360 break;
6361 case nir_intrinsic_ssbo_atomic_comp_swap:
6362 op32 = aco_opcode::buffer_atomic_cmpswap;
6363 op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6364 break;
6365 default:
6366 unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
6367 }
6368 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6369 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6370 mubuf->operands[0] = Operand(rsrc);
6371 mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6372 mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
6373 mubuf->operands[3] = Operand(data);
6374 if (return_previous)
6375 mubuf->definitions[0] = Definition(dst);
6376 mubuf->offset = 0;
6377 mubuf->offen = (offset.type() == RegType::vgpr);
6378 mubuf->glc = return_previous;
6379 mubuf->dlc = false; /* Not needed for atomics */
6380 mubuf->disable_wqm = true;
6381 mubuf->barrier = barrier_buffer;
6382 ctx->program->needs_exact = true;
6383 ctx->block->instructions.emplace_back(std::move(mubuf));
6384 }
6385
6386 void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
6387
6388 Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6389 Builder bld(ctx->program, ctx->block);
6390 Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
6391 get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
6392 }
6393
6394 void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
6395 {
6396 Builder bld(ctx->program, ctx->block);
6397 unsigned num_components = instr->num_components;
6398 unsigned component_size = instr->dest.ssa.bit_size / 8;
6399
6400 LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)),
6401 get_ssa_temp(ctx, &instr->dest.ssa),
6402 num_components, component_size};
6403 info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
6404 info.align_mul = nir_intrinsic_align_mul(instr);
6405 info.align_offset = nir_intrinsic_align_offset(instr);
6406 info.barrier = barrier_buffer;
6407 info.can_reorder = false;
6408 /* VMEM stores don't update the SMEM cache and it's difficult to prove that
6409 * it's safe to use SMEM */
6410 bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
6411 if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) || !can_use_smem) {
6412 emit_global_load(ctx, bld, &info);
6413 } else {
6414 info.offset = Operand(bld.as_uniform(info.offset));
6415 emit_smem_load(ctx, bld, &info);
6416 }
6417 }
6418
6419 void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
6420 {
6421 Builder bld(ctx->program, ctx->block);
6422 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6423 unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6424
6425 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6426 Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
6427 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6428
6429 if (ctx->options->chip_class >= GFX7)
6430 addr = as_vgpr(ctx, addr);
6431
6432 unsigned write_count = 0;
6433 Temp write_datas[32];
6434 unsigned offsets[32];
6435 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
6436 16, &write_count, write_datas, offsets);
6437
6438 for (unsigned i = 0; i < write_count; i++) {
6439 if (ctx->options->chip_class >= GFX7) {
6440 unsigned offset = offsets[i];
6441 Temp store_addr = addr;
6442 if (offset > 0 && ctx->options->chip_class < GFX9) {
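              /* FLAT/GLOBAL stores only support an immediate offset on GFX9+, so on
               * older chips the offset is added to the 64-bit address with an
               * explicit carry chain */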
6443 Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
6444 Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
6445 Temp carry = bld.tmp(bld.lm);
6446 bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
6447
6448 bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
6449 Operand(offset), addr0);
6450 bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
6451 Operand(0u), addr1,
6452 carry).def(1).setHint(vcc);
6453
6454 store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
6455
6456 offset = 0;
6457 }
6458
6459 bool global = ctx->options->chip_class >= GFX9;
6460 aco_opcode op;
6461 switch (write_datas[i].bytes()) {
6462 case 1:
6463 op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte;
6464 break;
6465 case 2:
6466 op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short;
6467 break;
6468 case 4:
6469 op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
6470 break;
6471 case 8:
6472 op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6473 break;
6474 case 12:
6475 op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6476 break;
6477 case 16:
6478 op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6479 break;
6480 default:
6481 unreachable("store_global not implemented for this size.");
6482 }
6483
6484 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6485 flat->operands[0] = Operand(store_addr);
6486 flat->operands[1] = Operand(s1);
6487 flat->operands[2] = Operand(write_datas[i]);
6488 flat->glc = glc;
6489 flat->dlc = false;
6490 flat->offset = offset;
6491 flat->disable_wqm = true;
6492 flat->barrier = barrier_buffer;
6493 ctx->program->needs_exact = true;
6494 ctx->block->instructions.emplace_back(std::move(flat));
6495 } else {
6496 assert(ctx->options->chip_class == GFX6);
6497
6498 aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
6499
6500 Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6501
6502 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6503 mubuf->operands[0] = Operand(rsrc);
6504 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6505 mubuf->operands[2] = Operand(0u);
6506 mubuf->operands[3] = Operand(write_datas[i]);
6507 mubuf->glc = glc;
6508 mubuf->dlc = false;
6509 mubuf->offset = offsets[i];
6510 mubuf->addr64 = addr.type() == RegType::vgpr;
6511 mubuf->disable_wqm = true;
6512 mubuf->barrier = barrier_buffer;
6513 ctx->program->needs_exact = true;
6514 ctx->block->instructions.emplace_back(std::move(mubuf));
6515 }
6516 }
6517 }
6518
6519 void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
6520 {
6521 /* return the previous value if dest is ever used */
6522 bool return_previous = false;
6523 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
6524 return_previous = true;
6525 break;
6526 }
6527 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
6528 return_previous = true;
6529 break;
6530 }
6531
6532 Builder bld(ctx->program, ctx->block);
6533 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
6534 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6535
6536 if (ctx->options->chip_class >= GFX7)
6537 addr = as_vgpr(ctx, addr);
6538
6539 if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
6540 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6541 get_ssa_temp(ctx, instr->src[2].ssa), data);
6542
6543 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6544
6545 aco_opcode op32, op64;
6546
6547 if (ctx->options->chip_class >= GFX7) {
6548 bool global = ctx->options->chip_class >= GFX9;
6549 switch (instr->intrinsic) {
6550 case nir_intrinsic_global_atomic_add:
6551 op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6552 op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6553 break;
6554 case nir_intrinsic_global_atomic_imin:
6555 op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6556 op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6557 break;
6558 case nir_intrinsic_global_atomic_umin:
6559 op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6560 op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6561 break;
6562 case nir_intrinsic_global_atomic_imax:
6563 op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6564 op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6565 break;
6566 case nir_intrinsic_global_atomic_umax:
6567 op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6568 op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6569 break;
6570 case nir_intrinsic_global_atomic_and:
6571 op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6572 op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6573 break;
6574 case nir_intrinsic_global_atomic_or:
6575 op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6576 op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6577 break;
6578 case nir_intrinsic_global_atomic_xor:
6579 op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6580 op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6581 break;
6582 case nir_intrinsic_global_atomic_exchange:
6583 op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6584 op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6585 break;
6586 case nir_intrinsic_global_atomic_comp_swap:
6587 op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6588 op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6589 break;
6590 default:
6591 unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
6592 }
6593
6594 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6595 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
6596 flat->operands[0] = Operand(addr);
6597 flat->operands[1] = Operand(s1);
6598 flat->operands[2] = Operand(data);
6599 if (return_previous)
6600 flat->definitions[0] = Definition(dst);
6601 flat->glc = return_previous;
6602 flat->dlc = false; /* Not needed for atomics */
6603 flat->offset = 0;
6604 flat->disable_wqm = true;
6605 flat->barrier = barrier_buffer;
6606 ctx->program->needs_exact = true;
6607 ctx->block->instructions.emplace_back(std::move(flat));
6608 } else {
6609 assert(ctx->options->chip_class == GFX6);
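           /* GFX6 has no FLAT instructions: emulate the global atomic with a MUBUF
            * access using 64-bit (ADDR64) addressing */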
6610
6611 switch (instr->intrinsic) {
6612 case nir_intrinsic_global_atomic_add:
6613 op32 = aco_opcode::buffer_atomic_add;
6614 op64 = aco_opcode::buffer_atomic_add_x2;
6615 break;
6616 case nir_intrinsic_global_atomic_imin:
6617 op32 = aco_opcode::buffer_atomic_smin;
6618 op64 = aco_opcode::buffer_atomic_smin_x2;
6619 break;
6620 case nir_intrinsic_global_atomic_umin:
6621 op32 = aco_opcode::buffer_atomic_umin;
6622 op64 = aco_opcode::buffer_atomic_umin_x2;
6623 break;
6624 case nir_intrinsic_global_atomic_imax:
6625 op32 = aco_opcode::buffer_atomic_smax;
6626 op64 = aco_opcode::buffer_atomic_smax_x2;
6627 break;
6628 case nir_intrinsic_global_atomic_umax:
6629 op32 = aco_opcode::buffer_atomic_umax;
6630 op64 = aco_opcode::buffer_atomic_umax_x2;
6631 break;
6632 case nir_intrinsic_global_atomic_and:
6633 op32 = aco_opcode::buffer_atomic_and;
6634 op64 = aco_opcode::buffer_atomic_and_x2;
6635 break;
6636 case nir_intrinsic_global_atomic_or:
6637 op32 = aco_opcode::buffer_atomic_or;
6638 op64 = aco_opcode::buffer_atomic_or_x2;
6639 break;
6640 case nir_intrinsic_global_atomic_xor:
6641 op32 = aco_opcode::buffer_atomic_xor;
6642 op64 = aco_opcode::buffer_atomic_xor_x2;
6643 break;
6644 case nir_intrinsic_global_atomic_exchange:
6645 op32 = aco_opcode::buffer_atomic_swap;
6646 op64 = aco_opcode::buffer_atomic_swap_x2;
6647 break;
6648 case nir_intrinsic_global_atomic_comp_swap:
6649 op32 = aco_opcode::buffer_atomic_cmpswap;
6650 op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6651 break;
6652 default:
6653 unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
6654 }
6655
6656 Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6657
6658 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6659
6660 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6661 mubuf->operands[0] = Operand(rsrc);
6662 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6663 mubuf->operands[2] = Operand(0u);
6664 mubuf->operands[3] = Operand(data);
6665 if (return_previous)
6666 mubuf->definitions[0] = Definition(dst);
6667 mubuf->glc = return_previous;
6668 mubuf->dlc = false;
6669 mubuf->offset = 0;
6670 mubuf->addr64 = addr.type() == RegType::vgpr;
6671 mubuf->disable_wqm = true;
6672 mubuf->barrier = barrier_buffer;
6673 ctx->program->needs_exact = true;
6674 ctx->block->instructions.emplace_back(std::move(mubuf));
6675 }
6676 }
6677
6678 void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
6679 Builder bld(ctx->program, ctx->block);
6680 switch(instr->intrinsic) {
6681 case nir_intrinsic_group_memory_barrier:
6682 case nir_intrinsic_memory_barrier:
6683 bld.barrier(aco_opcode::p_memory_barrier_common);
6684 break;
6685 case nir_intrinsic_memory_barrier_buffer:
6686 bld.barrier(aco_opcode::p_memory_barrier_buffer);
6687 break;
6688 case nir_intrinsic_memory_barrier_image:
6689 bld.barrier(aco_opcode::p_memory_barrier_image);
6690 break;
6691 case nir_intrinsic_memory_barrier_tcs_patch:
6692 case nir_intrinsic_memory_barrier_shared:
6693 bld.barrier(aco_opcode::p_memory_barrier_shared);
6694 break;
6695 default:
6696 unreachable("Unimplemented memory barrier intrinsic");
6697 break;
6698 }
6699 }
6700
6701 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
6702 {
6703 // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
6704 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6705 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6706 Builder bld(ctx->program, ctx->block);
6707
6708 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
6709 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
6710 load_lds(ctx, elem_size_bytes, dst, address, nir_intrinsic_base(instr), align);
6711 }
6712
6713 void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
6714 {
6715 unsigned writemask = nir_intrinsic_write_mask(instr);
6716 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6717 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6718 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6719
6720 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
6721 store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
6722 }
6723
6724 void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
6725 {
6726 unsigned offset = nir_intrinsic_base(instr);
6727 Builder bld(ctx->program, ctx->block);
6728 Operand m = load_lds_size_m0(bld);
6729 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6730 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6731
6732 unsigned num_operands = 3;
6733 aco_opcode op32, op64, op32_rtn, op64_rtn;
6734 switch(instr->intrinsic) {
6735 case nir_intrinsic_shared_atomic_add:
6736 op32 = aco_opcode::ds_add_u32;
6737 op64 = aco_opcode::ds_add_u64;
6738 op32_rtn = aco_opcode::ds_add_rtn_u32;
6739 op64_rtn = aco_opcode::ds_add_rtn_u64;
6740 break;
6741 case nir_intrinsic_shared_atomic_imin:
6742 op32 = aco_opcode::ds_min_i32;
6743 op64 = aco_opcode::ds_min_i64;
6744 op32_rtn = aco_opcode::ds_min_rtn_i32;
6745 op64_rtn = aco_opcode::ds_min_rtn_i64;
6746 break;
6747 case nir_intrinsic_shared_atomic_umin:
6748 op32 = aco_opcode::ds_min_u32;
6749 op64 = aco_opcode::ds_min_u64;
6750 op32_rtn = aco_opcode::ds_min_rtn_u32;
6751 op64_rtn = aco_opcode::ds_min_rtn_u64;
6752 break;
6753 case nir_intrinsic_shared_atomic_imax:
6754 op32 = aco_opcode::ds_max_i32;
6755 op64 = aco_opcode::ds_max_i64;
6756 op32_rtn = aco_opcode::ds_max_rtn_i32;
6757 op64_rtn = aco_opcode::ds_max_rtn_i64;
6758 break;
6759 case nir_intrinsic_shared_atomic_umax:
6760 op32 = aco_opcode::ds_max_u32;
6761 op64 = aco_opcode::ds_max_u64;
6762 op32_rtn = aco_opcode::ds_max_rtn_u32;
6763 op64_rtn = aco_opcode::ds_max_rtn_u64;
6764 break;
6765 case nir_intrinsic_shared_atomic_and:
6766 op32 = aco_opcode::ds_and_b32;
6767 op64 = aco_opcode::ds_and_b64;
6768 op32_rtn = aco_opcode::ds_and_rtn_b32;
6769 op64_rtn = aco_opcode::ds_and_rtn_b64;
6770 break;
6771 case nir_intrinsic_shared_atomic_or:
6772 op32 = aco_opcode::ds_or_b32;
6773 op64 = aco_opcode::ds_or_b64;
6774 op32_rtn = aco_opcode::ds_or_rtn_b32;
6775 op64_rtn = aco_opcode::ds_or_rtn_b64;
6776 break;
6777 case nir_intrinsic_shared_atomic_xor:
6778 op32 = aco_opcode::ds_xor_b32;
6779 op64 = aco_opcode::ds_xor_b64;
6780 op32_rtn = aco_opcode::ds_xor_rtn_b32;
6781 op64_rtn = aco_opcode::ds_xor_rtn_b64;
6782 break;
6783 case nir_intrinsic_shared_atomic_exchange:
6784 op32 = aco_opcode::ds_write_b32;
6785 op64 = aco_opcode::ds_write_b64;
6786 op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
6787 op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
6788 break;
6789 case nir_intrinsic_shared_atomic_comp_swap:
6790 op32 = aco_opcode::ds_cmpst_b32;
6791 op64 = aco_opcode::ds_cmpst_b64;
6792 op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
6793 op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
6794 num_operands = 4;
6795 break;
6796 default:
6797 unreachable("Unhandled shared atomic intrinsic");
6798 }
6799
6800 /* return the previous value if dest is ever used */
6801 bool return_previous = false;
6802 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
6803 return_previous = true;
6804 break;
6805 }
6806 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
6807 return_previous = true;
6808 break;
6809 }
6810
6811 aco_opcode op;
6812 if (data.size() == 1) {
6813 assert(instr->dest.ssa.bit_size == 32);
6814 op = return_previous ? op32_rtn : op32;
6815 } else {
6816 assert(instr->dest.ssa.bit_size == 64);
6817 op = return_previous ? op64_rtn : op64;
6818 }
6819
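   /* The DS instruction offset field is only 16 bits, so fold larger constant offsets into the address. */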
6820 if (offset > 65535) {
6821 address = bld.vadd32(bld.def(v1), Operand(offset), address);
6822 offset = 0;
6823 }
6824
6825 aco_ptr<DS_instruction> ds;
6826 ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
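   /* DS atomic operand layout: VGPR address, data (plus a compare value for cmpswap), and m0 last. */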
6827 ds->operands[0] = Operand(address);
6828 ds->operands[1] = Operand(data);
6829 if (num_operands == 4)
6830 ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
6831 ds->operands[num_operands - 1] = m;
6832 ds->offset0 = offset;
6833 if (return_previous)
6834 ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
6835 ctx->block->instructions.emplace_back(std::move(ds));
6836 }
6837
6838 Temp get_scratch_resource(isel_context *ctx)
6839 {
6840 Builder bld(ctx->program, ctx->block);
6841 Temp scratch_addr = ctx->program->private_segment_buffer;
6842 if (ctx->stage != compute_cs)
6843 scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
6844
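   /* Build the scratch buffer descriptor: ADD_TID_ENABLE swizzles addresses per lane and
    * INDEX_STRIDE encodes the wave size (3 = 64 lanes, 2 = 32 lanes). */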
6845 uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
6846                        S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
6847
6848 if (ctx->program->chip_class >= GFX10) {
6849 rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
6850 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
6851 S_008F0C_RESOURCE_LEVEL(1);
6852 } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
6853 rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6854 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
6855 }
6856
6857 /* older generations need element size = 16 bytes. element size removed in GFX9 */
6858 if (ctx->program->chip_class <= GFX8)
6859 rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
6860
6861 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
6862 }
6863
6864 void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
6865 Builder bld(ctx->program, ctx->block);
6866 Temp rsrc = get_scratch_resource(ctx);
6867 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6868 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6869
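   /* Scratch is swizzled per lane: loads are not combined across the 16-byte element size
    * (swizzle_component_size) and the per-wave scratch_offset is used as soffset. */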
6870 LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components,
6871 instr->dest.ssa.bit_size / 8u, rsrc};
6872 info.align_mul = nir_intrinsic_align_mul(instr);
6873 info.align_offset = nir_intrinsic_align_offset(instr);
6874 info.swizzle_component_size = 16;
6875 info.can_reorder = false;
6876 info.soffset = ctx->program->scratch_offset;
6877 emit_mubuf_load(ctx, bld, &info);
6878 }
6879
6880 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
6881 Builder bld(ctx->program, ctx->block);
6882 Temp rsrc = get_scratch_resource(ctx);
6883 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6884 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6885
6886 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6887 unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6888
6889 unsigned write_count = 0;
6890 Temp write_datas[32];
6891 unsigned offsets[32];
6892 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
6893 16, &write_count, write_datas, offsets);
6894
6895 for (unsigned i = 0; i < write_count; i++) {
6896 aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
6897 bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
6898 }
6899 }
6900
6901 void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
6902 uint8_t log2_ps_iter_samples;
6903 if (ctx->program->info->ps.force_persample) {
6904 log2_ps_iter_samples =
6905 util_logbase2(ctx->options->key.fs.num_samples);
6906 } else {
6907 log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
6908 }
6909
6910 /* The bit pattern matches that used by fixed function fragment
6911 * processing. */
6912 static const unsigned ps_iter_masks[] = {
6913 0xffff, /* not used */
6914 0x5555,
6915 0x1111,
6916 0x0101,
6917 0x0001,
6918 };
6919 assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
6920
6921 Builder bld(ctx->program, ctx->block);
6922
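   /* sample_mask_in = coverage & (ps_iter_masks[log2_ps_iter_samples] << sample_id):
    * with sample-rate shading, each invocation only covers every 2^N-th sample. */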
6923 Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
6924 get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
6925 Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
6926 Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
6927 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6928 bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
6929 }
6930
6931 void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr) {
6932 Builder bld(ctx->program, ctx->block);
6933
6934 unsigned stream = nir_intrinsic_stream_id(instr);
6935 Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6936 next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
6937 nir_const_value *next_vertex_cv = nir_src_as_const_value(instr->src[0]);
6938
6939 /* get GSVS ring */
6940 Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_GSVS_GS * 16u));
6941
6942 unsigned num_components =
6943 ctx->program->info->gs.num_stream_output_components[stream];
6944 assert(num_components);
6945
6946 unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
6947 unsigned stream_offset = 0;
6948 for (unsigned i = 0; i < stream; i++) {
6949 unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * ctx->shader->info.gs.vertices_out;
6950 stream_offset += prev_stride * ctx->program->wave_size;
6951 }
6952
6953 /* Limit on the stride field for <= GFX7. */
6954 assert(stride < (1 << 14));
6955
6956 Temp gsvs_dwords[4];
6957 for (unsigned i = 0; i < 4; i++)
6958 gsvs_dwords[i] = bld.tmp(s1);
6959 bld.pseudo(aco_opcode::p_split_vector,
6960 Definition(gsvs_dwords[0]),
6961 Definition(gsvs_dwords[1]),
6962 Definition(gsvs_dwords[2]),
6963 Definition(gsvs_dwords[3]),
6964 gsvs_ring);
6965
6966 if (stream_offset) {
6967 Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand(stream_offset));
6968
6969 Temp carry = bld.tmp(s1);
6970 gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), gsvs_dwords[0], stream_offset_tmp);
6971 gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(0u), bld.scc(carry));
6972 }
6973
6974 gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(S_008F04_STRIDE(stride)));
6975 gsvs_dwords[2] = bld.copy(bld.def(s1), Operand((uint32_t)ctx->program->wave_size));
6976
6977 gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
6978 gsvs_dwords[0], gsvs_dwords[1], gsvs_dwords[2], gsvs_dwords[3]);
6979
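   /* Store each enabled component of this stream to the GSVS ring, addressed by
    * next_vertex * 4 plus a per-component constant offset; offsets beyond the
    * 12-bit MTBUF limit (4096) are folded into the VGPR address below. */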
6980 unsigned offset = 0;
6981 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
6982 if (ctx->program->info->gs.output_streams[i] != stream)
6983 continue;
6984
6985 for (unsigned j = 0; j < 4; j++) {
6986 if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))
6987 continue;
6988
6989 if (ctx->outputs.mask[i] & (1 << j)) {
6990 Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
6991 unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
6992 if (const_offset >= 4096u) {
6993 if (vaddr_offset.isUndefined())
6994 vaddr_offset = bld.copy(bld.def(v1), Operand(const_offset / 4096u * 4096u));
6995 else
6996 vaddr_offset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), vaddr_offset);
6997 const_offset %= 4096u;
6998 }
6999
7000 aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
7001 mtbuf->operands[0] = Operand(gsvs_ring);
7002 mtbuf->operands[1] = vaddr_offset;
7003 mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->gs2vs_offset));
7004 mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);
7005 mtbuf->offen = !vaddr_offset.isUndefined();
7006 mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
7007 mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
7008 mtbuf->offset = const_offset;
7009 mtbuf->glc = true;
7010 mtbuf->slc = true;
7011 mtbuf->barrier = barrier_gs_data;
7012 mtbuf->can_reorder = true;
7013 bld.insert(std::move(mtbuf));
7014 }
7015
7016 offset += ctx->shader->info.gs.vertices_out;
7017 }
7018
7019 /* outputs for the next vertex are undefined and keeping them around can
7020 * create invalid IR with control flow */
7021 ctx->outputs.mask[i] = 0;
7022 }
7023
7024 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
7025 }
7026
7027 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
7028 {
7029 Builder bld(ctx->program, ctx->block);
7030
7031 if (cluster_size == 1) {
7032 return src;
7033 } if (op == nir_op_iand && cluster_size == 4) {
7034 //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
7035 Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7036 return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
7037 bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
7038 } else if (op == nir_op_ior && cluster_size == 4) {
7039 //subgroupClusteredOr(val, 4) -> wqm(val & exec)
7040 return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
7041 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
7042 } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
7043 //subgroupAnd(val) -> (exec & ~val) == 0
7044 Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
7045 Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
7046 return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
7047 } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
7048 //subgroupOr(val) -> (val & exec) != 0
7049 Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp();
7050 return bool_to_vector_condition(ctx, tmp);
7051 } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
7052 //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
7053 Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7054 tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
7055 tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
7056 return bool_to_vector_condition(ctx, tmp);
7057 } else {
7058 //subgroupClustered{And,Or,Xor}(val, n) ->
7059 //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) ; just v_mbcnt_lo_u32_b32 on wave32
7060 //cluster_offset = ~(n - 1) & lane_id
7061 //cluster_mask = ((1 << n) - 1)
7062 //subgroupClusteredAnd():
7063 // return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
7064 //subgroupClusteredOr():
7065 // return ((val & exec) >> cluster_offset) & cluster_mask != 0
7066 //subgroupClusteredXor():
7067    //  return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
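   // e.g. (illustrative) wave64, cluster_size = 8, lane_id = 13:
   //      cluster_offset = ~7 & 13 = 8, cluster_mask = 0xff -> test ballot bits 8..15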
7068 Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
7069 Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
7070
7071 Temp tmp;
7072 if (op == nir_op_iand)
7073 tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7074 else
7075 tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7076
7077 uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
7078
7079 if (ctx->program->chip_class <= GFX7)
7080 tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
7081 else if (ctx->program->wave_size == 64)
7082 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
7083 else
7084 tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
7085 tmp = emit_extract_vector(ctx, tmp, 0, v1);
7086 if (cluster_mask != 0xffffffff)
7087 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
7088
7089 Definition cmp_def = Definition();
7090 if (op == nir_op_iand) {
7091 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(cluster_mask), tmp).def(0);
7092 } else if (op == nir_op_ior) {
7093 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
7094 } else if (op == nir_op_ixor) {
7095 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
7096 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
7097 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
7098 }
7099 cmp_def.setHint(vcc);
7100 return cmp_def.getTemp();
7101 }
7102 }
7103
7104 Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
7105 {
7106 Builder bld(ctx->program, ctx->block);
7107
7108 //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
7109 //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
7110 //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
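   // mbcnt counts the set ballot bits in lanes below the current one, i.e. an exclusive prefix popcount.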
7111 Temp tmp;
7112 if (op == nir_op_iand)
7113 tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7114 else
7115       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7116
7117 Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
7118 Temp lo = lohi.def(0).getTemp();
7119 Temp hi = lohi.def(1).getTemp();
7120 Temp mbcnt = emit_mbcnt(ctx, bld.def(v1), Operand(lo), Operand(hi));
7121
7122 Definition cmp_def = Definition();
7123 if (op == nir_op_iand)
7124 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
7125 else if (op == nir_op_ior)
7126 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
7127 else if (op == nir_op_ixor)
7128 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u),
7129 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
7130 cmp_def.setHint(vcc);
7131 return cmp_def.getTemp();
7132 }
7133
7134 Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
7135 {
7136 Builder bld(ctx->program, ctx->block);
7137
7138 //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
7139 //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
7140 //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
7141 Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
7142 if (op == nir_op_iand)
7143 return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7144 else if (op == nir_op_ior)
7145 return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7146 else if (op == nir_op_ixor)
7147 return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7148
7149 assert(false);
7150 return Temp();
7151 }
7152
7153 void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
7154 {
7155 Builder bld(ctx->program, ctx->block);
7156 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7157 if (src.regClass().type() == RegType::vgpr) {
7158 bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7159 } else if (src.regClass() == s1) {
7160 bld.sop1(aco_opcode::s_mov_b32, dst, src);
7161 } else if (src.regClass() == s2) {
7162 bld.sop1(aco_opcode::s_mov_b64, dst, src);
7163 } else {
7164 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7165 nir_print_instr(&instr->instr, stderr);
7166 fprintf(stderr, "\n");
7167 }
7168 }
7169
7170 void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
7171 {
7172 Builder bld(ctx->program, ctx->block);
7173 Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center);
7174 Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1);
7175 Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1);
7176
7177 Temp ddx_1, ddx_2, ddy_1, ddy_2;
7178 uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
7179 uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
7180 uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
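   /* Each dpp_ctrl broadcasts one lane of the quad (0 = top-left, 1, 2), used below to
    * form ddx = p(lane 1) - p(lane 0) and ddy = p(lane 2) - p(lane 0). */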
7181
7182 /* Build DD X/Y */
7183 if (ctx->program->chip_class >= GFX8) {
7184 Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
7185 ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
7186 ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
7187 Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
7188 ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
7189 ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
7190 } else {
7191 Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
7192 ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
7193 ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
7194       ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
7195       ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_1);
7196       Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
7197       ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
7198       ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_2);
7199 ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
7200 ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
7201 }
7202
7203 /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
7204 Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
7205 Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
7206 tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
7207 tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
7208 Temp wqm1 = bld.tmp(v1);
7209 emit_wqm(ctx, tmp1, wqm1, true);
7210 Temp wqm2 = bld.tmp(v1);
7211 emit_wqm(ctx, tmp2, wqm2, true);
7212 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
7213 return;
7214 }
7215
7216 void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
7217 {
7218 Builder bld(ctx->program, ctx->block);
7219 switch(instr->intrinsic) {
7220 case nir_intrinsic_load_barycentric_sample:
7221 case nir_intrinsic_load_barycentric_pixel:
7222 case nir_intrinsic_load_barycentric_centroid: {
7223 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
7224 Temp bary = Temp(0, s2);
7225 switch (mode) {
7226 case INTERP_MODE_SMOOTH:
7227 case INTERP_MODE_NONE:
7228 if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
7229 bary = get_arg(ctx, ctx->args->ac.persp_center);
7230 else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
7231 bary = ctx->persp_centroid;
7232 else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
7233 bary = get_arg(ctx, ctx->args->ac.persp_sample);
7234 break;
7235 case INTERP_MODE_NOPERSPECTIVE:
7236 if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
7237 bary = get_arg(ctx, ctx->args->ac.linear_center);
7238 else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
7239 bary = ctx->linear_centroid;
7240 else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
7241 bary = get_arg(ctx, ctx->args->ac.linear_sample);
7242 break;
7243 default:
7244 break;
7245 }
7246 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7247 Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7248 Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7249 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
7250 Operand(p1), Operand(p2));
7251 emit_split_vector(ctx, dst, 2);
7252 break;
7253 }
7254 case nir_intrinsic_load_barycentric_model: {
7255 Temp model = get_arg(ctx, ctx->args->ac.pull_model);
7256
7257 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7258 Temp p1 = emit_extract_vector(ctx, model, 0, v1);
7259 Temp p2 = emit_extract_vector(ctx, model, 1, v1);
7260 Temp p3 = emit_extract_vector(ctx, model, 2, v1);
7261 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
7262 Operand(p1), Operand(p2), Operand(p3));
7263 emit_split_vector(ctx, dst, 3);
7264 break;
7265 }
7266 case nir_intrinsic_load_barycentric_at_sample: {
7267 uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
7268 switch (ctx->options->key.fs.num_samples) {
7269 case 2: sample_pos_offset += 1 << 3; break;
7270 case 4: sample_pos_offset += 3 << 3; break;
7271 case 8: sample_pos_offset += 7 << 3; break;
7272 default: break;
7273 }
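   /* The sample position table packs the 1x, 2x, 4x and 8x layouts back to back,
    * 8 bytes (two floats) per sample, hence the 1/3/7-entry offsets above. */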
7274 Temp sample_pos;
7275 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
7276 nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
7277 Temp private_segment_buffer = ctx->program->private_segment_buffer;
7278 //TODO: bounds checking?
7279 if (addr.type() == RegType::sgpr) {
7280 Operand offset;
7281 if (const_addr) {
7282 sample_pos_offset += const_addr->u32 << 3;
7283 offset = Operand(sample_pos_offset);
7284 } else if (ctx->options->chip_class >= GFX9) {
7285 offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
7286 } else {
7287 offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
7288             offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(sample_pos_offset));
7289 }
7290
7291 Operand off = bld.copy(bld.def(s1), Operand(offset));
7292 sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off);
7293
7294 } else if (ctx->options->chip_class >= GFX9) {
7295 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
7296 sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, private_segment_buffer, sample_pos_offset);
7297 } else if (ctx->options->chip_class >= GFX7) {
7298 /* addr += private_segment_buffer + sample_pos_offset */
7299 Temp tmp0 = bld.tmp(s1);
7300 Temp tmp1 = bld.tmp(s1);
7301 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), private_segment_buffer);
7302 Definition scc_tmp = bld.def(s1, scc);
7303 tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
7304 tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), bld.scc(scc_tmp.getTemp()));
7305 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
7306 Temp pck0 = bld.tmp(v1);
7307 Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
7308 tmp1 = as_vgpr(ctx, tmp1);
7309 Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry);
7310 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
7311
7312 /* sample_pos = flat_load_dwordx2 addr */
7313 sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
7314 } else {
7315 assert(ctx->options->chip_class == GFX6);
7316
7317 uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
7318 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
7319 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, Operand(0u), Operand(rsrc_conf));
7320
7321 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
7322 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand(0u));
7323
7324 sample_pos = bld.tmp(v2);
7325
7326 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)};
7327 load->definitions[0] = Definition(sample_pos);
7328 load->operands[0] = Operand(rsrc);
7329 load->operands[1] = Operand(addr);
7330 load->operands[2] = Operand(0u);
7331 load->offset = sample_pos_offset;
7332 load->offen = 0;
7333 load->addr64 = true;
7334 load->glc = false;
7335 load->dlc = false;
7336 load->disable_wqm = false;
7337 load->barrier = barrier_none;
7338 load->can_reorder = true;
7339 ctx->block->instructions.emplace_back(std::move(load));
7340 }
7341
7342 /* sample_pos -= 0.5 */
7343 Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
7344 Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
7345 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
7346 pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
7347 pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
7348
7349 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
7350 break;
7351 }
7352 case nir_intrinsic_load_barycentric_at_offset: {
7353 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
7354 RegClass rc = RegClass(offset.type(), 1);
7355 Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
7356 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
7357 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
7358 break;
7359 }
7360 case nir_intrinsic_load_front_face: {
7361 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7362 Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc);
7363 break;
7364 }
7365 case nir_intrinsic_load_view_index: {
7366 if (ctx->stage & (sw_vs | sw_gs | sw_tcs | sw_tes)) {
7367 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7368 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
7369 break;
7370 }
7371
7372 /* fallthrough */
7373 }
7374 case nir_intrinsic_load_layer_id: {
7375 unsigned idx = nir_intrinsic_base(instr);
7376 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7377 Operand(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0);
7378 break;
7379 }
7380 case nir_intrinsic_load_frag_coord: {
7381 emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
7382 break;
7383 }
7384 case nir_intrinsic_load_sample_pos: {
7385 Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
7386 Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
7387 bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7388 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
7389 posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
7390 break;
7391 }
7392 case nir_intrinsic_load_tess_coord:
7393 visit_load_tess_coord(ctx, instr);
7394 break;
7395 case nir_intrinsic_load_interpolated_input:
7396 visit_load_interpolated_input(ctx, instr);
7397 break;
7398 case nir_intrinsic_store_output:
7399 visit_store_output(ctx, instr);
7400 break;
7401 case nir_intrinsic_load_input:
7402 case nir_intrinsic_load_input_vertex:
7403 visit_load_input(ctx, instr);
7404 break;
7405 case nir_intrinsic_load_output:
7406 visit_load_output(ctx, instr);
7407 break;
7408 case nir_intrinsic_load_per_vertex_input:
7409 visit_load_per_vertex_input(ctx, instr);
7410 break;
7411 case nir_intrinsic_load_per_vertex_output:
7412 visit_load_per_vertex_output(ctx, instr);
7413 break;
7414 case nir_intrinsic_store_per_vertex_output:
7415 visit_store_per_vertex_output(ctx, instr);
7416 break;
7417 case nir_intrinsic_load_ubo:
7418 visit_load_ubo(ctx, instr);
7419 break;
7420 case nir_intrinsic_load_push_constant:
7421 visit_load_push_constant(ctx, instr);
7422 break;
7423 case nir_intrinsic_load_constant:
7424 visit_load_constant(ctx, instr);
7425 break;
7426 case nir_intrinsic_vulkan_resource_index:
7427 visit_load_resource(ctx, instr);
7428 break;
7429 case nir_intrinsic_discard:
7430 visit_discard(ctx, instr);
7431 break;
7432 case nir_intrinsic_discard_if:
7433 visit_discard_if(ctx, instr);
7434 break;
7435 case nir_intrinsic_load_shared:
7436 visit_load_shared(ctx, instr);
7437 break;
7438 case nir_intrinsic_store_shared:
7439 visit_store_shared(ctx, instr);
7440 break;
7441 case nir_intrinsic_shared_atomic_add:
7442 case nir_intrinsic_shared_atomic_imin:
7443 case nir_intrinsic_shared_atomic_umin:
7444 case nir_intrinsic_shared_atomic_imax:
7445 case nir_intrinsic_shared_atomic_umax:
7446 case nir_intrinsic_shared_atomic_and:
7447 case nir_intrinsic_shared_atomic_or:
7448 case nir_intrinsic_shared_atomic_xor:
7449 case nir_intrinsic_shared_atomic_exchange:
7450 case nir_intrinsic_shared_atomic_comp_swap:
7451 visit_shared_atomic(ctx, instr);
7452 break;
7453 case nir_intrinsic_image_deref_load:
7454 visit_image_load(ctx, instr);
7455 break;
7456 case nir_intrinsic_image_deref_store:
7457 visit_image_store(ctx, instr);
7458 break;
7459 case nir_intrinsic_image_deref_atomic_add:
7460 case nir_intrinsic_image_deref_atomic_umin:
7461 case nir_intrinsic_image_deref_atomic_imin:
7462 case nir_intrinsic_image_deref_atomic_umax:
7463 case nir_intrinsic_image_deref_atomic_imax:
7464 case nir_intrinsic_image_deref_atomic_and:
7465 case nir_intrinsic_image_deref_atomic_or:
7466 case nir_intrinsic_image_deref_atomic_xor:
7467 case nir_intrinsic_image_deref_atomic_exchange:
7468 case nir_intrinsic_image_deref_atomic_comp_swap:
7469 visit_image_atomic(ctx, instr);
7470 break;
7471 case nir_intrinsic_image_deref_size:
7472 visit_image_size(ctx, instr);
7473 break;
7474 case nir_intrinsic_load_ssbo:
7475 visit_load_ssbo(ctx, instr);
7476 break;
7477 case nir_intrinsic_store_ssbo:
7478 visit_store_ssbo(ctx, instr);
7479 break;
7480 case nir_intrinsic_load_global:
7481 visit_load_global(ctx, instr);
7482 break;
7483 case nir_intrinsic_store_global:
7484 visit_store_global(ctx, instr);
7485 break;
7486 case nir_intrinsic_global_atomic_add:
7487 case nir_intrinsic_global_atomic_imin:
7488 case nir_intrinsic_global_atomic_umin:
7489 case nir_intrinsic_global_atomic_imax:
7490 case nir_intrinsic_global_atomic_umax:
7491 case nir_intrinsic_global_atomic_and:
7492 case nir_intrinsic_global_atomic_or:
7493 case nir_intrinsic_global_atomic_xor:
7494 case nir_intrinsic_global_atomic_exchange:
7495 case nir_intrinsic_global_atomic_comp_swap:
7496 visit_global_atomic(ctx, instr);
7497 break;
7498 case nir_intrinsic_ssbo_atomic_add:
7499 case nir_intrinsic_ssbo_atomic_imin:
7500 case nir_intrinsic_ssbo_atomic_umin:
7501 case nir_intrinsic_ssbo_atomic_imax:
7502 case nir_intrinsic_ssbo_atomic_umax:
7503 case nir_intrinsic_ssbo_atomic_and:
7504 case nir_intrinsic_ssbo_atomic_or:
7505 case nir_intrinsic_ssbo_atomic_xor:
7506 case nir_intrinsic_ssbo_atomic_exchange:
7507 case nir_intrinsic_ssbo_atomic_comp_swap:
7508 visit_atomic_ssbo(ctx, instr);
7509 break;
7510 case nir_intrinsic_load_scratch:
7511 visit_load_scratch(ctx, instr);
7512 break;
7513 case nir_intrinsic_store_scratch:
7514 visit_store_scratch(ctx, instr);
7515 break;
7516 case nir_intrinsic_get_buffer_size:
7517 visit_get_buffer_size(ctx, instr);
7518 break;
7519 case nir_intrinsic_control_barrier: {
7520 if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
7521 /* GFX6 only (thanks to a hw bug workaround):
7522        * The real barrier instruction isn't needed, because an entire patch
7523 * always fits into a single wave.
7524 */
7525 break;
7526 }
7527
7528 if (ctx->program->workgroup_size > ctx->program->wave_size)
7529 bld.sopp(aco_opcode::s_barrier);
7530
7531 break;
7532 }
7533 case nir_intrinsic_memory_barrier_tcs_patch:
7534 case nir_intrinsic_group_memory_barrier:
7535 case nir_intrinsic_memory_barrier:
7536 case nir_intrinsic_memory_barrier_buffer:
7537 case nir_intrinsic_memory_barrier_image:
7538 case nir_intrinsic_memory_barrier_shared:
7539 emit_memory_barrier(ctx, instr);
7540 break;
7541 case nir_intrinsic_load_num_work_groups: {
7542 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7543 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
7544 emit_split_vector(ctx, dst, 3);
7545 break;
7546 }
7547 case nir_intrinsic_load_local_invocation_id: {
7548 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7549 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
7550 emit_split_vector(ctx, dst, 3);
7551 break;
7552 }
7553 case nir_intrinsic_load_work_group_id: {
7554 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7555 struct ac_arg *args = ctx->args->ac.workgroup_ids;
7556 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
7557 args[0].used ? Operand(get_arg(ctx, args[0])) : Operand(0u),
7558 args[1].used ? Operand(get_arg(ctx, args[1])) : Operand(0u),
7559 args[2].used ? Operand(get_arg(ctx, args[2])) : Operand(0u));
7560 emit_split_vector(ctx, dst, 3);
7561 break;
7562 }
7563 case nir_intrinsic_load_local_invocation_index: {
7564 Temp id = emit_mbcnt(ctx, bld.def(v1));
7565
7566       /* The tg_size bits [6:11] contain the subgroup id;
7567        * multiply it by the wave size and then OR the thread id into it.
7568        */
7569 if (ctx->program->wave_size == 64) {
7570 /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just feed that to v_or */
7571 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u),
7572 get_arg(ctx, ctx->args->ac.tg_size));
7573 bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
7574 } else {
7575 /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
7576 Temp tg_num = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
7577 get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
7578 bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, Operand(0x5u), id);
7579 }
7580 break;
7581 }
7582 case nir_intrinsic_load_subgroup_id: {
7583 if (ctx->stage == compute_cs) {
7584 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc),
7585 get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
7586 } else {
7587 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
7588 }
7589 break;
7590 }
7591 case nir_intrinsic_load_subgroup_invocation: {
7592 emit_mbcnt(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)));
7593 break;
7594 }
7595 case nir_intrinsic_load_num_subgroups: {
7596 if (ctx->stage == compute_cs)
7597 bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu),
7598 get_arg(ctx, ctx->args->ac.tg_size));
7599 else
7600 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
7601 break;
7602 }
7603 case nir_intrinsic_ballot: {
7604 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7605 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7606 Definition tmp = bld.def(dst.regClass());
7607 Definition lanemask_tmp = dst.size() == bld.lm.size() ? tmp : bld.def(src.regClass());
7608 if (instr->src[0].ssa->bit_size == 1) {
7609 assert(src.regClass() == bld.lm);
7610 bld.sop2(Builder::s_and, lanemask_tmp, bld.def(s1, scc), Operand(exec, bld.lm), src);
7611 } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
7612 bld.vopc(aco_opcode::v_cmp_lg_u32, lanemask_tmp, Operand(0u), src);
7613 } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
7614 bld.vopc(aco_opcode::v_cmp_lg_u64, lanemask_tmp, Operand(0u), src);
7615 } else {
7616 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7617 nir_print_instr(&instr->instr, stderr);
7618 fprintf(stderr, "\n");
7619 }
7620 if (dst.size() != bld.lm.size()) {
7621 /* Wave32 with ballot size set to 64 */
7622 bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lanemask_tmp.getTemp(), Operand(0u));
7623 }
7624 emit_wqm(ctx, tmp.getTemp(), dst);
7625 break;
7626 }
7627 case nir_intrinsic_shuffle:
7628 case nir_intrinsic_read_invocation: {
7629 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7630 if (!nir_src_is_divergent(instr->src[0])) {
7631 emit_uniform_subgroup(ctx, instr, src);
7632 } else {
7633 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
7634 if (instr->intrinsic == nir_intrinsic_read_invocation || !nir_src_is_divergent(instr->src[1]))
7635 tid = bld.as_uniform(tid);
7636 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7637 if (src.regClass() == v1b || src.regClass() == v2b) {
7638 Temp tmp = bld.tmp(v1);
7639 tmp = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), tmp);
7640 if (dst.type() == RegType::vgpr)
7641 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
7642 else
7643 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
7644 } else if (src.regClass() == v1) {
7645 emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst);
7646 } else if (src.regClass() == v2) {
7647 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7648 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7649 lo = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, lo));
7650 hi = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, hi));
7651 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7652 emit_split_vector(ctx, dst, 2);
7653 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
7654 assert(src.regClass() == bld.lm);
7655 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
7656 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
7657 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
7658 assert(src.regClass() == bld.lm);
7659 Temp tmp;
7660 if (ctx->program->chip_class <= GFX7)
7661 tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
7662 else if (ctx->program->wave_size == 64)
7663 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
7664 else
7665 tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
7666 tmp = emit_extract_vector(ctx, tmp, 0, v1);
7667 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
7668 emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst);
7669 } else {
7670 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7671 nir_print_instr(&instr->instr, stderr);
7672 fprintf(stderr, "\n");
7673 }
7674 }
7675 break;
7676 }
7677 case nir_intrinsic_load_sample_id: {
7678 bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7679 get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
7680 break;
7681 }
7682 case nir_intrinsic_load_sample_mask_in: {
7683 visit_load_sample_mask_in(ctx, instr);
7684 break;
7685 }
7686 case nir_intrinsic_read_first_invocation: {
7687 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7688 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7689 if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
7690 emit_wqm(ctx,
7691 bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
7692 dst);
7693 } else if (src.regClass() == v2) {
7694 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7695 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7696 lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
7697 hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
7698 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7699 emit_split_vector(ctx, dst, 2);
7700 } else if (instr->dest.ssa.bit_size == 1) {
7701 assert(src.regClass() == bld.lm);
7702 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
7703 bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
7704 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
7705 } else if (src.regClass() == s1) {
7706 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
7707 } else if (src.regClass() == s2) {
7708 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
7709 } else {
7710 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7711 nir_print_instr(&instr->instr, stderr);
7712 fprintf(stderr, "\n");
7713 }
7714 break;
7715 }
7716 case nir_intrinsic_vote_all: {
7717 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7718 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7719 assert(src.regClass() == bld.lm);
7720 assert(dst.regClass() == bld.lm);
7721
7722 Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
7723 Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
7724 bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
7725 break;
7726 }
7727 case nir_intrinsic_vote_any: {
7728 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7729 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7730 assert(src.regClass() == bld.lm);
7731 assert(dst.regClass() == bld.lm);
7732
7733 Temp tmp = bool_to_scalar_condition(ctx, src);
7734 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
7735 break;
7736 }
7737 case nir_intrinsic_reduce:
7738 case nir_intrinsic_inclusive_scan:
7739 case nir_intrinsic_exclusive_scan: {
7740 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7741 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7742 nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
7743 unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
7744 nir_intrinsic_cluster_size(instr) : 0;
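      /* A cluster size of 0 means the whole subgroup; clamp to the wave size and
       * round up to a power of two for the reduction lowering. */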
7745 cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
7746
7747 if (!nir_src_is_divergent(instr->src[0]) && (op == nir_op_ior || op == nir_op_iand)) {
7748 emit_uniform_subgroup(ctx, instr, src);
7749 } else if (instr->dest.ssa.bit_size == 1) {
7750 if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
7751 op = nir_op_iand;
7752 else if (op == nir_op_iadd)
7753 op = nir_op_ixor;
7754 else if (op == nir_op_umax || op == nir_op_imax)
7755 op = nir_op_ior;
7756 assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
7757
7758 switch (instr->intrinsic) {
7759 case nir_intrinsic_reduce:
7760 emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
7761 break;
7762 case nir_intrinsic_exclusive_scan:
7763 emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
7764 break;
7765 case nir_intrinsic_inclusive_scan:
7766 emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
7767 break;
7768 default:
7769 assert(false);
7770 }
7771 } else if (cluster_size == 1) {
7772 bld.copy(Definition(dst), src);
7773 } else {
7774 unsigned bit_size = instr->src[0].ssa->bit_size;
7775
7776 src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
7777
7778 ReduceOp reduce_op;
7779 switch (op) {
7780 #define CASEI(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; break;
7781 #define CASEF(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; break;
7782 CASEI(iadd)
7783 CASEI(imul)
7784 CASEI(imin)
7785 CASEI(umin)
7786 CASEI(imax)
7787 CASEI(umax)
7788 CASEI(iand)
7789 CASEI(ior)
7790 CASEI(ixor)
7791 CASEF(fadd)
7792 CASEF(fmul)
7793 CASEF(fmin)
7794 CASEF(fmax)
7795 default:
7796 unreachable("unknown reduction op");
7797 #undef CASEI
7798 #undef CASEF
7799 }
7800
7801 aco_opcode aco_op;
7802 switch (instr->intrinsic) {
7803 case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
7804 case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
7805 case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
7806 default:
7807 unreachable("unknown reduce intrinsic");
7808 }
7809
7810 aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
7811 reduce->operands[0] = Operand(src);
7812 // filled in by aco_reduce_assign.cpp, used internally as part of the
7813 // reduce sequence
7814 assert(dst.size() == 1 || dst.size() == 2);
7815 reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7816 reduce->operands[2] = Operand(v1.as_linear());
7817
7818 Temp tmp_dst = bld.tmp(dst.regClass());
7819 reduce->definitions[0] = Definition(tmp_dst);
7820 reduce->definitions[1] = bld.def(ctx->program->lane_mask); // used internally
7821 reduce->definitions[2] = Definition();
7822 reduce->definitions[3] = Definition(scc, s1);
7823 reduce->definitions[4] = Definition();
7824 reduce->reduce_op = reduce_op;
7825 reduce->cluster_size = cluster_size;
7826 ctx->block->instructions.emplace_back(std::move(reduce));
7827
7828 emit_wqm(ctx, tmp_dst, dst);
7829 }
7830 break;
7831 }
7832 case nir_intrinsic_quad_broadcast: {
7833 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7834 if (!nir_dest_is_divergent(instr->dest)) {
7835 emit_uniform_subgroup(ctx, instr, src);
7836 } else {
7837 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7838 unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
7839 uint32_t dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
7840
7841 if (instr->dest.ssa.bit_size == 1) {
7842 assert(src.regClass() == bld.lm);
7843 assert(dst.regClass() == bld.lm);
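         /* Boolean quad broadcast with mask ops: keep bit <lane> of each quad from the
          * live ballot, then s_wqm replicates it to all four lanes of that quad. */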
7844 uint32_t half_mask = 0x11111111u << lane;
7845 Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
7846 Temp tmp = bld.tmp(bld.lm);
7847 bld.sop1(Builder::s_wqm, Definition(tmp),
7848 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp,
7849 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))));
7850 emit_wqm(ctx, tmp, dst);
7851 } else if (instr->dest.ssa.bit_size == 8) {
7852 Temp tmp = bld.tmp(v1);
7853 if (ctx->program->chip_class >= GFX8)
7854 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
7855 else
7856 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp);
7857 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
7858 } else if (instr->dest.ssa.bit_size == 16) {
7859 Temp tmp = bld.tmp(v1);
7860 if (ctx->program->chip_class >= GFX8)
7861 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
7862 else
7863 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp);
7864 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
7865 } else if (instr->dest.ssa.bit_size == 32) {
7866 if (ctx->program->chip_class >= GFX8)
7867 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst);
7868 else
7869 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), dst);
7870 } else if (instr->dest.ssa.bit_size == 64) {
7871 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7872 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7873 if (ctx->program->chip_class >= GFX8) {
7874 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
7875 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
7876 } else {
7877 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl));
7878 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl));
7879 }
7880 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7881 emit_split_vector(ctx, dst, 2);
7882 } else {
7883 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7884 nir_print_instr(&instr->instr, stderr);
7885 fprintf(stderr, "\n");
7886 }
7887 }
7888 break;
7889 }
7890 case nir_intrinsic_quad_swap_horizontal:
7891 case nir_intrinsic_quad_swap_vertical:
7892 case nir_intrinsic_quad_swap_diagonal:
7893 case nir_intrinsic_quad_swizzle_amd: {
7894 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7895 if (!nir_dest_is_divergent(instr->dest)) {
7896 emit_uniform_subgroup(ctx, instr, src);
7897 break;
7898 }
7899 uint16_t dpp_ctrl = 0;
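      /* quad_perm patterns: horizontal swaps lanes 0<->1 and 2<->3, vertical swaps
       * 0<->2 and 1<->3, diagonal swaps 0<->3 and 1<->2. */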
7900 switch (instr->intrinsic) {
7901 case nir_intrinsic_quad_swap_horizontal:
7902 dpp_ctrl = dpp_quad_perm(1, 0, 3, 2);
7903 break;
7904 case nir_intrinsic_quad_swap_vertical:
7905 dpp_ctrl = dpp_quad_perm(2, 3, 0, 1);
7906 break;
7907 case nir_intrinsic_quad_swap_diagonal:
7908 dpp_ctrl = dpp_quad_perm(3, 2, 1, 0);
7909 break;
7910 case nir_intrinsic_quad_swizzle_amd:
7911 dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
7912 break;
7913 default:
7914 break;
7915 }
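      /* Without DPP (GFX6/7), ds_swizzle_b32 is used instead: bit 15 of its offset
       * selects quad-permute mode with the same 8-bit pattern. */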
7916 if (ctx->program->chip_class < GFX8)
7917 dpp_ctrl |= (1 << 15);
7918
7919 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7920 if (instr->dest.ssa.bit_size == 1) {
7921 assert(src.regClass() == bld.lm);
7922 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
7923 if (ctx->program->chip_class >= GFX8)
7924 src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
7925 else
7926 src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
7927 Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
7928 emit_wqm(ctx, tmp, dst);
7929 } else if (instr->dest.ssa.bit_size == 8) {
7930 Temp tmp = bld.tmp(v1);
7931 if (ctx->program->chip_class >= GFX8)
7932 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
7933 else
7934 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
7935 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
7936 } else if (instr->dest.ssa.bit_size == 16) {
7937 Temp tmp = bld.tmp(v1);
7938 if (ctx->program->chip_class >= GFX8)
7939 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
7940 else
7941 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
7942 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
7943 } else if (instr->dest.ssa.bit_size == 32) {
7944 Temp tmp;
7945 if (ctx->program->chip_class >= GFX8)
7946 tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
7947 else
7948 tmp = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
7949 emit_wqm(ctx, tmp, dst);
7950 } else if (instr->dest.ssa.bit_size == 64) {
7951 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7952 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7953 if (ctx->program->chip_class >= GFX8) {
7954 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
7955 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
7956 } else {
7957 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, dpp_ctrl));
7958 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, dpp_ctrl));
7959 }
7960 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7961 emit_split_vector(ctx, dst, 2);
7962 } else {
7963 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7964 nir_print_instr(&instr->instr, stderr);
7965 fprintf(stderr, "\n");
7966 }
7967 break;
7968 }
7969 case nir_intrinsic_masked_swizzle_amd: {
7970 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7971 if (!nir_dest_is_divergent(instr->dest)) {
7972 emit_uniform_subgroup(ctx, instr, src);
7973 break;
7974 }
7975 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7976 uint32_t mask = nir_intrinsic_swizzle_mask(instr);
7977 if (instr->dest.ssa.bit_size == 1) {
7978 assert(src.regClass() == bld.lm);
7979 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
7980 src = emit_masked_swizzle(ctx, bld, src, mask);
7981 Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
7982 emit_wqm(ctx, tmp, dst);
7983 } else if (dst.regClass() == v1b) {
7984 Temp tmp = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask));
7985 emit_extract_vector(ctx, tmp, 0, dst);
7986 } else if (dst.regClass() == v2b) {
7987 Temp tmp = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask));
7988 emit_extract_vector(ctx, tmp, 0, dst);
7989 } else if (dst.regClass() == v1) {
7990 emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask), dst);
7991 } else if (dst.regClass() == v2) {
7992 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7993 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7994 lo = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, lo, mask));
7995 hi = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, hi, mask));
7996 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7997 emit_split_vector(ctx, dst, 2);
7998 } else {
7999 fprintf(stderr, "Unimplemented NIR instr bit size: ");
8000 nir_print_instr(&instr->instr, stderr);
8001 fprintf(stderr, "\n");
8002 }
8003 break;
8004 }
8005 case nir_intrinsic_write_invocation_amd: {
8006 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8007 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8008 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
8009 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8010 if (dst.regClass() == v1) {
8011 /* src2 is ignored for writelane. RA assigns the same reg for dst */
8012 emit_wqm(ctx, bld.writelane(bld.def(v1), val, lane, src), dst);
8013 } else if (dst.regClass() == v2) {
8014 Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
8015 Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
8016 bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
8017 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
8018          Temp lo = emit_wqm(ctx, bld.writelane(bld.def(v1), val_lo, lane, src_lo));
8019 Temp hi = emit_wqm(ctx, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
8020 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8021 emit_split_vector(ctx, dst, 2);
8022 } else {
8023 fprintf(stderr, "Unimplemented NIR instr bit size: ");
8024 nir_print_instr(&instr->instr, stderr);
8025 fprintf(stderr, "\n");
8026 }
8027 break;
8028 }
8029 case nir_intrinsic_mbcnt_amd: {
8030 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8031 RegClass rc = RegClass(src.type(), 1);
8032 Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
8033 bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
8034 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8035 Temp wqm_tmp = emit_mbcnt(ctx, bld.def(v1), Operand(mask_lo), Operand(mask_hi));
8036 emit_wqm(ctx, wqm_tmp, dst);
8037 break;
8038 }
8039 case nir_intrinsic_load_helper_invocation: {
8040 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8041 bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
8042 ctx->block->kind |= block_kind_needs_lowering;
8043 ctx->program->needs_exact = true;
8044 break;
8045 }
8046 case nir_intrinsic_is_helper_invocation: {
8047 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8048 bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
8049 ctx->block->kind |= block_kind_needs_lowering;
8050 ctx->program->needs_exact = true;
8051 break;
8052 }
8053 case nir_intrinsic_demote:
8054 bld.pseudo(aco_opcode::p_demote_to_helper, Operand(-1u));
8055
8056 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8057 ctx->cf_info.exec_potentially_empty_discard = true;
8058 ctx->block->kind |= block_kind_uses_demote;
8059 ctx->program->needs_exact = true;
8060 break;
8061 case nir_intrinsic_demote_if: {
8062 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8063 assert(src.regClass() == bld.lm);
8064 Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8065 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8066
8067 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8068 ctx->cf_info.exec_potentially_empty_discard = true;
8069 ctx->block->kind |= block_kind_uses_demote;
8070 ctx->program->needs_exact = true;
8071 break;
8072 }
8073 case nir_intrinsic_first_invocation: {
8074 emit_wqm(ctx, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
8075 get_ssa_temp(ctx, &instr->dest.ssa));
8076 break;
8077 }
8078 case nir_intrinsic_shader_clock: {
8079 aco_opcode opcode =
8080 nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ?
8081 aco_opcode::s_memrealtime : aco_opcode::s_memtime;
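      /* s_memrealtime reads the device-wide real-time counter, while s_memtime reads
       * the shader core clock, which suffices for subgroup scope. */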
8082 bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
8083 emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
8084 break;
8085 }
8086 case nir_intrinsic_load_vertex_id_zero_base: {
8087 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8088 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
8089 break;
8090 }
8091 case nir_intrinsic_load_first_vertex: {
8092 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8093 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
8094 break;
8095 }
8096 case nir_intrinsic_load_base_instance: {
8097 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8098 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
8099 break;
8100 }
8101 case nir_intrinsic_load_instance_id: {
8102 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8103 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
8104 break;
8105 }
8106 case nir_intrinsic_load_draw_id: {
8107 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8108 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
8109 break;
8110 }
8111 case nir_intrinsic_load_invocation_id: {
8112 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8113
8114 if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
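         /* On GFX10 the GS invocation ID shares its input VGPR with other fields,
          * so only bits [6:0] are valid and the rest is masked off. */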
8115 if (ctx->options->chip_class >= GFX10)
8116 bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand(127u), get_arg(ctx, ctx->args->ac.gs_invocation_id));
8117 else
8118 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
8119 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
8120 bld.vop3(aco_opcode::v_bfe_u32, Definition(dst),
8121 get_arg(ctx, ctx->args->ac.tcs_rel_ids), Operand(8u), Operand(5u));
8122 } else {
8123 unreachable("Unsupported stage for load_invocation_id");
8124 }
8125
8126 break;
8127 }
8128 case nir_intrinsic_load_primitive_id: {
8129 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8130
8131 switch (ctx->shader->info.stage) {
8132 case MESA_SHADER_GEOMETRY:
8133 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
8134 break;
8135 case MESA_SHADER_TESS_CTRL:
8136 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id));
8137 break;
8138 case MESA_SHADER_TESS_EVAL:
8139 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id));
8140 break;
8141 default:
8142 unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
8143 }
8144
8145 break;
8146 }
8147 case nir_intrinsic_load_patch_vertices_in: {
8148 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL ||
8149 ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
8150
8151 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8152 bld.copy(Definition(dst), Operand(ctx->args->options->key.tcs.input_vertices));
8153 break;
8154 }
8155 case nir_intrinsic_emit_vertex_with_counter: {
8156 visit_emit_vertex_with_counter(ctx, instr);
8157 break;
8158 }
8159 case nir_intrinsic_end_primitive_with_counter: {
8160 unsigned stream = nir_intrinsic_stream_id(instr);
8161 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream));
8162 break;
8163 }
8164 case nir_intrinsic_set_vertex_count: {
8165 /* unused, the HW keeps track of this for us */
8166 break;
8167 }
8168 default:
8169 fprintf(stderr, "Unimplemented intrinsic instr: ");
8170 nir_print_instr(&instr->instr, stderr);
8171 fprintf(stderr, "\n");
8172 abort();
8173
8174 break;
8175 }
8176 }
8177
8178
8179 void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr,
8180 Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr,
8181 enum glsl_base_type *stype)
8182 {
8183 nir_deref_instr *texture_deref_instr = NULL;
8184 nir_deref_instr *sampler_deref_instr = NULL;
8185 int plane = -1;
8186
8187 for (unsigned i = 0; i < instr->num_srcs; i++) {
8188 switch (instr->src[i].src_type) {
8189 case nir_tex_src_texture_deref:
8190 texture_deref_instr = nir_src_as_deref(instr->src[i].src);
8191 break;
8192 case nir_tex_src_sampler_deref:
8193 sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
8194 break;
8195 case nir_tex_src_plane:
8196 plane = nir_src_as_int(instr->src[i].src);
8197 break;
8198 default:
8199 break;
8200 }
8201 }
8202
8203 *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
8204
8205 if (!sampler_deref_instr)
8206 sampler_deref_instr = texture_deref_instr;
8207
8208 if (plane >= 0) {
8209 assert(instr->op != nir_texop_txf_ms &&
8210 instr->op != nir_texop_samples_identical);
8211 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
8212 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
8213 } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
8214 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
8215 } else if (instr->op == nir_texop_fragment_mask_fetch) {
8216 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
8217 } else {
8218 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
8219 }
8220 if (samp_ptr) {
8221 *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false);
8222
8223 if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
8224 /* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */
8225 Builder bld(ctx->program, ctx->block);
8226
8227 /* to avoid unnecessary moves, we split and recombine sampler and image */
8228 Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1),
8229 bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
8230 Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
8231 bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]),
8232 Definition(img[2]), Definition(img[3]), Definition(img[4]),
8233 Definition(img[5]), Definition(img[6]), Definition(img[7]), *res_ptr);
8234 bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]),
8235 Definition(samp[2]), Definition(samp[3]), *samp_ptr);
8236
8237 samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]);
8238 *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
8239 img[0], img[1], img[2], img[3],
8240 img[4], img[5], img[6], img[7]);
8241 *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
8242 samp[0], samp[1], samp[2], samp[3]);
8243 }
8244 }
8245 if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
8246 instr->op == nir_texop_samples_identical))
8247 *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
8248 }
8249
8250 void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
8251 Temp *out_ma, Temp *out_sc, Temp *out_tc)
8252 {
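   /* Select the derivative components for the active cube face: out_ma/out_sc/out_tc
    * are the major-axis, s and t derivatives for the face chosen by id (the face index).
    * This mirrors build_cube_select() in ac_nir_to_llvm.c. */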
8253 Builder bld(ctx->program, ctx->block);
8254
8255 Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
8256 Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
8257 Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
8258
8259 Operand neg_one(0xbf800000u);
8260 Operand one(0x3f800000u);
8261 Operand two(0x40000000u);
8262 Operand four(0x40800000u);
8263
8264 Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma);
8265 Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
8266 Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
8267
8268 Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
8269 Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
8270 is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
8271 Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y);
8272
8273 // select sc
8274 Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
8275 Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1),
8276 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z),
8277 one, is_ma_y);
8278 *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
8279
8280 // select tc
8281 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
8282 sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
8283 *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
8284
8285 // select ma
8286 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
8287 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
8288 deriv_z, is_ma_z);
8289 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp);
8290 *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
8291 }
8292
8293 void prepare_cube_coords(isel_context *ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
8294 {
8295 Builder bld(ctx->program, ctx->block);
8296 Temp ma, tc, sc, id;
8297
8298 if (is_array) {
8299 coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);
8300
8301 // see comment in ac_prepare_cube_coords()
8302 if (ctx->options->chip_class <= GFX8)
8303 coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coords[3]);
8304 }
8305
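   /* v_cubema/v_cubesc/v_cubetc/v_cubeid compute 2*major_axis, the per-face s/t
    * coordinates and the face index from the 3D direction; s/t are then scaled by
    * 1/|ma| and biased by 1.5 as image_sample expects for cube faces. */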
8306 ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8307
8308 aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
8309 vop3a->operands[0] = Operand(ma);
8310 vop3a->abs[0] = true;
8311 Temp invma = bld.tmp(v1);
8312 vop3a->definitions[0] = Definition(invma);
8313 ctx->block->instructions.emplace_back(std::move(vop3a));
8314
8315 sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8316 if (!is_deriv)
8317 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
8318
8319 tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8320 if (!is_deriv)
8321 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
8322
8323 id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8324
8325 if (is_deriv) {
8326 sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
8327 tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
8328
8329 for (unsigned i = 0; i < 2; i++) {
8330 // see comment in ac_prepare_cube_coords()
8331 Temp deriv_ma;
8332 Temp deriv_sc, deriv_tc;
8333 build_cube_select(ctx, ma, id, i ? *ddy : *ddx,
8334 &deriv_ma, &deriv_sc, &deriv_tc);
8335
8336 deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
8337
8338 Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
8339 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
8340 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
8341 Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
8342 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
8343 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
8344 *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
8345 }
8346
8347 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc);
8348 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc);
8349 }
8350
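   /* for cube arrays the layer is merged into the face index: id = layer * 8.0 + face */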
8351 if (is_array)
8352 id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/));
8353 coords.resize(3);
8354 coords[0] = sc;
8355 coords[1] = tc;
8356 coords[2] = id;
8357 }
8358
8359 void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
8360 {
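   /* If vec comes from a nir vecN with non-swizzled sources, collect the per-component
    * constant values (NULL for non-constant components); used to fold constant texel offsets. */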
8361 if (vec->parent_instr->type != nir_instr_type_alu)
8362 return;
8363 nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr);
8364 if (vec_instr->op != nir_op_vec(vec->num_components))
8365 return;
8366
8367 for (unsigned i = 0; i < vec->num_components; i++) {
8368 cv[i] = vec_instr->src[i].swizzle[0] == 0 ?
8369 nir_src_as_const_value(vec_instr->src[i].src) : NULL;
8370 }
8371 }
8372
8373 void visit_tex(isel_context *ctx, nir_tex_instr *instr)
8374 {
8375 Builder bld(ctx->program, ctx->block);
8376 bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
8377 has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false,
8378 has_clamped_lod = false;
8379 Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(),
8380 lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(),
8381 clamped_lod = Temp();
8382 std::vector<Temp> coords;
8383 std::vector<Temp> derivs;
8384 nir_const_value *sample_index_cv = NULL;
8385 nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
8386 enum glsl_base_type stype;
8387 tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
8388
8389 bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
8390 (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
8391 bool tg4_integer_cube_workaround = tg4_integer_workarounds &&
8392 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
8393
8394 for (unsigned i = 0; i < instr->num_srcs; i++) {
8395 switch (instr->src[i].src_type) {
8396 case nir_tex_src_coord: {
8397 Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
8398 for (unsigned i = 0; i < coord.size(); i++)
8399 coords.emplace_back(emit_extract_vector(ctx, coord, i, v1));
8400 break;
8401 }
8402 case nir_tex_src_bias:
8403 bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
8404 has_bias = true;
8405 break;
8406 case nir_tex_src_lod: {
8407 nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
8408
8409 if (val && val->f32 <= 0.0) {
8410 level_zero = true;
8411 } else {
8412 lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
8413 has_lod = true;
8414 }
8415 break;
8416 }
8417 case nir_tex_src_min_lod:
8418 clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
8419 has_clamped_lod = true;
8420 break;
8421 case nir_tex_src_comparator:
8422 if (instr->is_shadow) {
8423 compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
8424 has_compare = true;
8425 }
8426 break;
8427 case nir_tex_src_offset:
8428 offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
8429 get_const_vec(instr->src[i].src.ssa, const_offset);
8430 has_offset = true;
8431 break;
8432 case nir_tex_src_ddx:
8433 ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
8434 has_ddx = true;
8435 break;
8436 case nir_tex_src_ddy:
8437 ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
8438 has_ddy = true;
8439 break;
8440 case nir_tex_src_ms_index:
8441 sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
8442 sample_index_cv = nir_src_as_const_value(instr->src[i].src);
8443 has_sample_index = true;
8444 break;
8445 case nir_tex_src_texture_offset:
8446 case nir_tex_src_sampler_offset:
8447 default:
8448 break;
8449 }
8450 }
8451
8452 if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
8453 return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
8454
8455 if (instr->op == nir_texop_texture_samples) {
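      /* The sample count comes from the resource descriptor: dword3 stores
       * log2(samples) in bits [19:16] and the resource type in bits [31:28].
       * Non-MSAA resource types (< 14) report a sample count of 1. */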
8456 Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
8457
8458 Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
8459 Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
8460 Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
8461
8462 Operand default_sample = Operand(1u);
8463 if (ctx->options->robust_buffer_access) {
8464 /* Extract the second dword of the descriptor, if it's
8465 * all zero, then it's a null descriptor.
8466 */
8467 Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
8468 Temp is_non_null_descriptor = bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u));
8469 default_sample = Operand(is_non_null_descriptor);
8470 }
8471
8472 Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
8473 bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8474 samples, default_sample, bld.scc(is_msaa));
8475 return;
8476 }
8477
8478 if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
8479 aco_ptr<Instruction> tmp_instr;
8480 Temp acc, pack = Temp();
8481
8482 uint32_t pack_const = 0;
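      /* Texel offsets are 6-bit fields packed one per byte into a single dword, which
       * is what the image_sample_*_o offset operand expects. Constant components are
       * folded into pack_const, the remaining ones are packed at runtime. */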
8483 for (unsigned i = 0; i < offset.size(); i++) {
8484 if (!const_offset[i])
8485 continue;
8486 pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
8487 }
8488
8489 if (offset.type() == RegType::sgpr) {
8490 for (unsigned i = 0; i < offset.size(); i++) {
8491 if (const_offset[i])
8492 continue;
8493
8494 acc = emit_extract_vector(ctx, offset, i, s1);
8495 acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
8496
8497 if (i) {
8498 acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
8499 }
8500
8501 if (pack == Temp()) {
8502 pack = acc;
8503 } else {
8504 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
8505 }
8506 }
8507
8508 if (pack_const && pack != Temp())
8509 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
8510 } else {
8511 for (unsigned i = 0; i < offset.size(); i++) {
8512 if (const_offset[i])
8513 continue;
8514
8515 acc = emit_extract_vector(ctx, offset, i, v1);
8516 acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
8517
8518 if (i) {
8519 acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
8520 }
8521
8522 if (pack == Temp()) {
8523 pack = acc;
8524 } else {
8525 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
8526 }
8527 }
8528
8529 if (pack_const && pack != Temp())
8530 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
8531 }
8532 if (pack_const && pack == Temp())
8533 offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
8534 else if (pack == Temp())
8535 has_offset = false;
8536 else
8537 offset = pack;
8538 }
8539
8540 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
8541 prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
8542
8543 /* pack derivatives */
8544 if (has_ddx || has_ddy) {
8545 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
8546 assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1);
8547 Temp zero = bld.copy(bld.def(v1), Operand(0u));
8548 derivs = {ddx, zero, ddy, zero};
8549 } else {
8550 for (unsigned i = 0; has_ddx && i < ddx.size(); i++)
8551 derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1));
8552 for (unsigned i = 0; has_ddy && i < ddy.size(); i++)
8553 derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1));
8554 }
8555 has_derivs = true;
8556 }
8557
8558 if (instr->coord_components > 1 &&
8559 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
8560 instr->is_array &&
8561 instr->op != nir_texop_txf)
8562 coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]);
8563
8564 if (instr->coord_components > 2 &&
8565 (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
8566 instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
8567 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
8568 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
8569 instr->is_array &&
8570 instr->op != nir_texop_txf &&
8571 instr->op != nir_texop_txf_ms &&
8572 instr->op != nir_texop_fragment_fetch &&
8573 instr->op != nir_texop_fragment_mask_fetch)
8574 coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]);
8575
8576 if (ctx->options->chip_class == GFX9 &&
8577 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
8578 instr->op != nir_texop_lod && instr->coord_components) {
8579 assert(coords.size() > 0 && coords.size() < 3);
8580
8581 coords.insert(std::next(coords.begin()), bld.copy(bld.def(v1), instr->op == nir_texop_txf ?
8582 Operand((uint32_t) 0) :
8583 Operand((uint32_t) 0x3f000000)));
8584 }
8585
8586 bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
8587
8588 if (instr->op == nir_texop_samples_identical)
8589 resource = fmask_ptr;
8590
8591 else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
8592 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
8593 instr->op != nir_texop_txs &&
8594 instr->op != nir_texop_fragment_fetch &&
8595 instr->op != nir_texop_fragment_mask_fetch) {
8596 assert(has_sample_index);
8597 Operand op(sample_index);
8598 if (sample_index_cv)
8599 op = Operand(sample_index_cv->u32);
8600 sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
8601 }
8602
8603 if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
8604 for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) {
8605 Temp off = emit_extract_vector(ctx, offset, i, v1);
8606 coords[i] = bld.vadd32(bld.def(v1), coords[i], off);
8607 }
8608 has_offset = false;
8609 }
8610
8611 /* Build tex instruction */
8612 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
8613 unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
8614 ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
8615 : 0;
8616 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8617 Temp tmp_dst = dst;
8618
8619 /* gather4 selects the component by dmask and always returns vec4 */
8620 if (instr->op == nir_texop_tg4) {
8621 assert(instr->dest.ssa.num_components == 4);
8622 if (instr->is_shadow)
8623 dmask = 1;
8624 else
8625 dmask = 1 << instr->component;
8626 if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
8627 tmp_dst = bld.tmp(v4);
8628 } else if (instr->op == nir_texop_samples_identical) {
8629 tmp_dst = bld.tmp(v1);
8630 } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) {
8631 tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
8632 }
8633
8634 aco_ptr<MIMG_instruction> tex;
8635 if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
8636 if (!has_lod)
8637 lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
8638
8639 bool div_by_6 = instr->op == nir_texop_txs &&
8640 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
8641 instr->is_array &&
8642 (dmask & (1 << 2));
8643 if (tmp_dst.id() == dst.id() && div_by_6)
8644 tmp_dst = bld.tmp(tmp_dst.regClass());
8645
8646 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
8647 tex->operands[0] = Operand(resource);
8648 tex->operands[1] = Operand(s4); /* no sampler */
8649 tex->operands[2] = Operand(as_vgpr(ctx, lod));
8650 if (ctx->options->chip_class == GFX9 &&
8651 instr->op == nir_texop_txs &&
8652 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
8653 instr->is_array) {
8654 tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
8655 } else if (instr->op == nir_texop_query_levels) {
8656 tex->dmask = 1 << 3;
8657 } else {
8658 tex->dmask = dmask;
8659 }
8660 tex->da = da;
8661 tex->definitions[0] = Definition(tmp_dst);
8662 tex->dim = dim;
8663 tex->can_reorder = true;
8664 ctx->block->instructions.emplace_back(std::move(tex));
8665
8666 if (div_by_6) {
8667 /* divide 3rd value by 6 by multiplying with magic number */
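         /* The resinfo result for cube arrays is layers * 6; 0x2AAAAAAB is
          * ceil(2^32 / 6), so v_mul_hi_i32 with it yields the value divided by 6
          * for the sizes involved. */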
8668 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
8669 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
8670 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c);
8671 assert(instr->dest.ssa.num_components == 3);
8672 Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
8673 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
8674 emit_extract_vector(ctx, tmp_dst, 0, v1),
8675 emit_extract_vector(ctx, tmp_dst, 1, v1),
8676 by_6);
8677
8678 }
8679
8680 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
8681 return;
8682 }
8683
8684 Temp tg4_compare_cube_wa64 = Temp();
8685
8686 if (tg4_integer_workarounds) {
8687 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
8688 tex->operands[0] = Operand(resource);
8689 tex->operands[1] = Operand(s4); /* no sampler */
8690 tex->operands[2] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
8691 tex->dim = dim;
8692 tex->dmask = 0x3;
8693 tex->da = da;
8694 Temp size = bld.tmp(v2);
8695 tex->definitions[0] = Definition(size);
8696 tex->can_reorder = true;
8697 ctx->block->instructions.emplace_back(std::move(tex));
8698 emit_split_vector(ctx, size, size.size());
8699
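      /* Gather4 on integer formats selects texels with different rounding on GFX8 and
       * older, so shift the coordinates by half a texel (-0.5 / size) to compensate
       * (see lower_gather4_integer() in ac_nir_to_llvm.c). */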
8700 Temp half_texel[2];
8701 for (unsigned i = 0; i < 2; i++) {
8702 half_texel[i] = emit_extract_vector(ctx, size, i, v1);
8703 half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
8704 half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
8705 half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
8706 }
8707
8708 Temp new_coords[2] = {
8709 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
8710 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])
8711 };
8712
8713 if (tg4_integer_cube_workaround) {
8714 // see comment in ac_nir_to_llvm.c's lower_gather4_integer()
8715 Temp desc[resource.size()];
8716 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector,
8717 Format::PSEUDO, 1, resource.size())};
8718 split->operands[0] = Operand(resource);
8719 for (unsigned i = 0; i < resource.size(); i++) {
8720 desc[i] = bld.tmp(s1);
8721 split->definitions[i] = Definition(desc[i]);
8722 }
8723 ctx->block->instructions.emplace_back(std::move(split));
8724
8725 Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16)));
8726 Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
8727 Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8));
8728
8729 Temp nfmt;
8730 if (stype == GLSL_TYPE_UINT) {
8731 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
8732 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED),
8733 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT),
8734 bld.scc(compare_cube_wa));
8735 } else {
8736 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
8737 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED),
8738 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
8739 bld.scc(compare_cube_wa));
8740 }
8741 tg4_compare_cube_wa64 = bld.tmp(bld.lm);
8742 bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
8743
8744 nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
8745
8746 desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
8747 Operand((uint32_t)C_008F14_NUM_FORMAT));
8748 desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
8749
8750 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
8751 Format::PSEUDO, resource.size(), 1)};
8752 for (unsigned i = 0; i < resource.size(); i++)
8753 vec->operands[i] = Operand(desc[i]);
8754 resource = bld.tmp(resource.regClass());
8755 vec->definitions[0] = Definition(resource);
8756 ctx->block->instructions.emplace_back(std::move(vec));
8757
8758 new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
8759 new_coords[0], coords[0], tg4_compare_cube_wa64);
8760 new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
8761 new_coords[1], coords[1], tg4_compare_cube_wa64);
8762 }
8763 coords[0] = new_coords[0];
8764 coords[1] = new_coords[1];
8765 }
8766
8767 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
8768 //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
8769
8770 assert(coords.size() == 1);
8771 unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
8772 aco_opcode op;
8773 switch (last_bit) {
8774 case 1:
8775 op = aco_opcode::buffer_load_format_x; break;
8776 case 2:
8777 op = aco_opcode::buffer_load_format_xy; break;
8778 case 3:
8779 op = aco_opcode::buffer_load_format_xyz; break;
8780 case 4:
8781 op = aco_opcode::buffer_load_format_xyzw; break;
8782 default:
8783 unreachable("Tex instruction loads more than 4 components.");
8784 }
8785
8786 /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */
8787 if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
8788 tmp_dst = dst;
8789 else
8790 tmp_dst = bld.tmp(RegType::vgpr, last_bit);
8791
8792 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
8793 mubuf->operands[0] = Operand(resource);
8794 mubuf->operands[1] = Operand(coords[0]);
8795 mubuf->operands[2] = Operand((uint32_t) 0);
8796 mubuf->definitions[0] = Definition(tmp_dst);
8797 mubuf->idxen = true;
8798 mubuf->can_reorder = true;
8799 ctx->block->instructions.emplace_back(std::move(mubuf));
8800
8801 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
8802 return;
8803 }
8804
8805 /* gather MIMG address components */
8806 std::vector<Temp> args;
8807 if (has_offset)
8808 args.emplace_back(offset);
8809 if (has_bias)
8810 args.emplace_back(bias);
8811 if (has_compare)
8812 args.emplace_back(compare);
8813 if (has_derivs)
8814 args.insert(args.end(), derivs.begin(), derivs.end());
8815
8816 args.insert(args.end(), coords.begin(), coords.end());
8817 if (has_sample_index)
8818 args.emplace_back(sample_index);
8819 if (has_lod)
8820 args.emplace_back(lod);
8821 if (has_clamped_lod)
8822 args.emplace_back(clamped_lod);
8823
8824 Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size()));
8825 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
8826 vec->definitions[0] = Definition(arg);
8827 for (unsigned i = 0; i < args.size(); i++)
8828 vec->operands[i] = Operand(args[i]);
8829 ctx->block->instructions.emplace_back(std::move(vec));
8830
8831
8832 if (instr->op == nir_texop_txf ||
8833 instr->op == nir_texop_txf_ms ||
8834 instr->op == nir_texop_samples_identical ||
8835 instr->op == nir_texop_fragment_fetch ||
8836 instr->op == nir_texop_fragment_mask_fetch) {
8837 aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
8838 tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 3, 1));
8839 tex->operands[0] = Operand(resource);
8840 tex->operands[1] = Operand(s4); /* no sampler */
8841 tex->operands[2] = Operand(arg);
8842 tex->dim = dim;
8843 tex->dmask = dmask;
8844 tex->unrm = true;
8845 tex->da = da;
8846 tex->definitions[0] = Definition(tmp_dst);
8847 tex->can_reorder = true;
8848 ctx->block->instructions.emplace_back(std::move(tex));
8849
8850 if (instr->op == nir_texop_samples_identical) {
8851 assert(dmask == 1 && dst.regClass() == v1);
8852 assert(dst.id() != tmp_dst.id());
8853
8854 Temp tmp = bld.tmp(bld.lm);
8855 bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
8856 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
8857
8858 } else {
8859 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
8860 }
8861 return;
8862 }
8863
8864 // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
8865 aco_opcode opcode = aco_opcode::image_sample;
8866 if (has_offset) { /* image_sample_*_o */
8867 if (has_clamped_lod) {
8868 if (has_compare) {
8869 opcode = aco_opcode::image_sample_c_cl_o;
8870 if (has_derivs)
8871 opcode = aco_opcode::image_sample_c_d_cl_o;
8872 if (has_bias)
8873 opcode = aco_opcode::image_sample_c_b_cl_o;
8874 } else {
8875 opcode = aco_opcode::image_sample_cl_o;
8876 if (has_derivs)
8877 opcode = aco_opcode::image_sample_d_cl_o;
8878 if (has_bias)
8879 opcode = aco_opcode::image_sample_b_cl_o;
8880 }
8881 } else if (has_compare) {
8882 opcode = aco_opcode::image_sample_c_o;
8883 if (has_derivs)
8884 opcode = aco_opcode::image_sample_c_d_o;
8885 if (has_bias)
8886 opcode = aco_opcode::image_sample_c_b_o;
8887 if (level_zero)
8888 opcode = aco_opcode::image_sample_c_lz_o;
8889 if (has_lod)
8890 opcode = aco_opcode::image_sample_c_l_o;
8891 } else {
8892 opcode = aco_opcode::image_sample_o;
8893 if (has_derivs)
8894 opcode = aco_opcode::image_sample_d_o;
8895 if (has_bias)
8896 opcode = aco_opcode::image_sample_b_o;
8897 if (level_zero)
8898 opcode = aco_opcode::image_sample_lz_o;
8899 if (has_lod)
8900 opcode = aco_opcode::image_sample_l_o;
8901 }
8902 } else if (has_clamped_lod) { /* image_sample_*_cl */
8903 if (has_compare) {
8904 opcode = aco_opcode::image_sample_c_cl;
8905 if (has_derivs)
8906 opcode = aco_opcode::image_sample_c_d_cl;
8907 if (has_bias)
8908 opcode = aco_opcode::image_sample_c_b_cl;
8909 } else {
8910 opcode = aco_opcode::image_sample_cl;
8911 if (has_derivs)
8912 opcode = aco_opcode::image_sample_d_cl;
8913 if (has_bias)
8914 opcode = aco_opcode::image_sample_b_cl;
8915 }
8916 } else { /* no offset */
8917 if (has_compare) {
8918 opcode = aco_opcode::image_sample_c;
8919 if (has_derivs)
8920 opcode = aco_opcode::image_sample_c_d;
8921 if (has_bias)
8922 opcode = aco_opcode::image_sample_c_b;
8923 if (level_zero)
8924 opcode = aco_opcode::image_sample_c_lz;
8925 if (has_lod)
8926 opcode = aco_opcode::image_sample_c_l;
8927 } else {
8928 opcode = aco_opcode::image_sample;
8929 if (has_derivs)
8930 opcode = aco_opcode::image_sample_d;
8931 if (has_bias)
8932 opcode = aco_opcode::image_sample_b;
8933 if (level_zero)
8934 opcode = aco_opcode::image_sample_lz;
8935 if (has_lod)
8936 opcode = aco_opcode::image_sample_l;
8937 }
8938 }
8939
8940 if (instr->op == nir_texop_tg4) {
8941 if (has_offset) { /* image_gather4_*_o */
8942 if (has_compare) {
8943 opcode = aco_opcode::image_gather4_c_lz_o;
8944 if (has_lod)
8945 opcode = aco_opcode::image_gather4_c_l_o;
8946 if (has_bias)
8947 opcode = aco_opcode::image_gather4_c_b_o;
8948 } else {
8949 opcode = aco_opcode::image_gather4_lz_o;
8950 if (has_lod)
8951 opcode = aco_opcode::image_gather4_l_o;
8952 if (has_bias)
8953 opcode = aco_opcode::image_gather4_b_o;
8954 }
8955 } else {
8956 if (has_compare) {
8957 opcode = aco_opcode::image_gather4_c_lz;
8958 if (has_lod)
8959 opcode = aco_opcode::image_gather4_c_l;
8960 if (has_bias)
8961 opcode = aco_opcode::image_gather4_c_b;
8962 } else {
8963 opcode = aco_opcode::image_gather4_lz;
8964 if (has_lod)
8965 opcode = aco_opcode::image_gather4_l;
8966 if (has_bias)
8967 opcode = aco_opcode::image_gather4_b;
8968 }
8969 }
8970 } else if (instr->op == nir_texop_lod) {
8971 opcode = aco_opcode::image_get_lod;
8972 }
8973
8974 /* we don't need the bias, sample index, compare value or offset to be
8975 * computed in WQM but if the p_create_vector copies the coordinates, then it
8976 * needs to be in WQM */
8977 if (ctx->stage == fragment_fs &&
8978 !has_derivs && !has_lod && !level_zero &&
8979 instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
8980 instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
8981 arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true);
8982
8983 tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
8984 tex->operands[0] = Operand(resource);
8985 tex->operands[1] = Operand(sampler);
8986 tex->operands[2] = Operand(arg);
8987 tex->dim = dim;
8988 tex->dmask = dmask;
8989 tex->da = da;
8990 tex->definitions[0] = Definition(tmp_dst);
8991 tex->can_reorder = true;
8992 ctx->block->instructions.emplace_back(std::move(tex));
8993
8994 if (tg4_integer_cube_workaround) {
8995 assert(tmp_dst.id() != dst.id());
8996 assert(tmp_dst.size() == dst.size() && dst.size() == 4);
8997
8998 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
8999 Temp val[4];
9000 for (unsigned i = 0; i < dst.size(); i++) {
9001 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
9002 Temp cvt_val;
9003 if (stype == GLSL_TYPE_UINT)
9004 cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
9005 else
9006 cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
9007 val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64);
9008 }
9009 Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4);
9010 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
9011 val[0], val[1], val[2], val[3]);
9012 }
9013 unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask;
9014 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
9015
9016 }
9017
9018
9019 Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc, bool logical)
9020 {
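   /* Undefs become undefined operands. 1-bit constants feeding logical phis are
    * expanded to a full lane mask (all ones or zero) to match the boolean representation. */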
9021 Temp tmp = get_ssa_temp(ctx, ssa);
9022 if (ssa->parent_instr->type == nir_instr_type_ssa_undef) {
9023 return Operand(rc);
9024 } else if (logical && ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) {
9025 if (ctx->program->wave_size == 64)
9026 return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX : 0u);
9027 else
9028 return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX : 0u);
9029 } else {
9030 return Operand(tmp);
9031 }
9032 }
9033
9034 void visit_phi(isel_context *ctx, nir_phi_instr *instr)
9035 {
9036 aco_ptr<Pseudo_instruction> phi;
9037 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9038 assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
9039
9040 bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest);
9041 logical |= ctx->block->kind & block_kind_merge;
9042 aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
9043
9044 /* we want a sorted list of sources, since the predecessor list is also sorted */
9045 std::map<unsigned, nir_ssa_def*> phi_src;
9046 nir_foreach_phi_src(src, instr)
9047 phi_src[src->pred->index] = src->src.ssa;
9048
9049 std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
9050 unsigned num_operands = 0;
9051 Operand operands[std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1];
9052 unsigned num_defined = 0;
9053 unsigned cur_pred_idx = 0;
9054 for (std::pair<unsigned, nir_ssa_def *> src : phi_src) {
9055 if (cur_pred_idx < preds.size()) {
9056 /* handle missing preds (IF merges with discard/break) and extra preds (loop exit with discard) */
9057 unsigned block = ctx->cf_info.nir_to_aco[src.first];
9058 unsigned skipped = 0;
9059 while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
9060 skipped++;
9061 if (cur_pred_idx + skipped < preds.size()) {
9062 for (unsigned i = 0; i < skipped; i++)
9063 operands[num_operands++] = Operand(dst.regClass());
9064 cur_pred_idx += skipped;
9065 } else {
9066 continue;
9067 }
9068 }
9069 /* Handle missing predecessors at the end. This shouldn't happen with loop
9070 * headers and we can't ignore these sources for loop header phis. */
9071 if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
9072 continue;
9073 cur_pred_idx++;
9074 Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
9075 operands[num_operands++] = op;
9076 num_defined += !op.isUndefined();
9077 }
9078 /* handle block_kind_continue_or_break at loop exit blocks */
9079 while (cur_pred_idx++ < preds.size())
9080 operands[num_operands++] = Operand(dst.regClass());
9081
9082 /* If the loop ends with a break, still add a linear continue edge in case
9083 * that break is divergent or continue_or_break is used. We'll either remove
9084 * this operand later in visit_loop() if it's not necessary or replace the
9085 * undef with something correct. */
9086 if (!logical && ctx->block->kind & block_kind_loop_header) {
9087 nir_loop *loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
9088 nir_block *last = nir_loop_last_block(loop);
9089 if (last->successors[0] != instr->instr.block)
9090 operands[num_operands++] = Operand(RegClass());
9091 }
9092
9093 if (num_defined == 0) {
9094 Builder bld(ctx->program, ctx->block);
9095 if (dst.regClass() == s1) {
9096 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
9097 } else if (dst.regClass() == v1) {
9098 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
9099 } else {
9100 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9101 for (unsigned i = 0; i < dst.size(); i++)
9102 vec->operands[i] = Operand(0u);
9103 vec->definitions[0] = Definition(dst);
9104 ctx->block->instructions.emplace_back(std::move(vec));
9105 }
9106 return;
9107 }
9108
9109 /* we can use a linear phi in some cases if one src is undef */
9110 if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
9111 phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, num_operands, 1));
9112
9113 Block *linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
9114 Block *invert = &ctx->program->blocks[linear_else->linear_preds[0]];
9115 assert(invert->kind & block_kind_invert);
9116
9117 unsigned then_block = invert->linear_preds[0];
9118
9119 Block* insert_block = NULL;
9120 for (unsigned i = 0; i < num_operands; i++) {
9121 Operand op = operands[i];
9122 if (op.isUndefined())
9123 continue;
9124 insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
9125 phi->operands[0] = op;
9126 break;
9127 }
9128 assert(insert_block); /* should be handled by the "num_defined == 0" case above */
9129 phi->operands[1] = Operand(dst.regClass());
9130 phi->definitions[0] = Definition(dst);
9131 insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
9132 return;
9133 }
9134
9135 /* try to scalarize vector phis */
9136 if (instr->dest.ssa.bit_size != 1 && dst.size() > 1) {
9137 // TODO: scalarize linear phis on divergent ifs
9138 bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
9139 std::array<Temp, NIR_MAX_VEC_COMPONENTS> new_vec;
9140 for (unsigned i = 0; can_scalarize && (i < num_operands); i++) {
9141 Operand src = operands[i];
9142 if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end())
9143 can_scalarize = false;
9144 }
9145 if (can_scalarize) {
9146 unsigned num_components = instr->dest.ssa.num_components;
9147 assert(dst.size() % num_components == 0);
9148 RegClass rc = RegClass(dst.type(), dst.size() / num_components);
9149
9150 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
9151 for (unsigned k = 0; k < num_components; k++) {
9152 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
9153 for (unsigned i = 0; i < num_operands; i++) {
9154 Operand src = operands[i];
9155 phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc);
9156 }
9157 Temp phi_dst = {ctx->program->allocateId(), rc};
9158 phi->definitions[0] = Definition(phi_dst);
9159 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9160 new_vec[k] = phi_dst;
9161 vec->operands[k] = Operand(phi_dst);
9162 }
9163 vec->definitions[0] = Definition(dst);
9164 ctx->block->instructions.emplace_back(std::move(vec));
9165 ctx->allocated_vec.emplace(dst.id(), new_vec);
9166 return;
9167 }
9168 }
9169
9170 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
9171 for (unsigned i = 0; i < num_operands; i++)
9172 phi->operands[i] = operands[i];
9173 phi->definitions[0] = Definition(dst);
9174 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9175 }
9176
9177
9178 void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr)
9179 {
9180 Temp dst = get_ssa_temp(ctx, &instr->def);
9181
9182 assert(dst.type() == RegType::sgpr);
9183
9184 if (dst.size() == 1) {
9185 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u));
9186 } else {
9187 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9188 for (unsigned i = 0; i < dst.size(); i++)
9189 vec->operands[i] = Operand(0u);
9190 vec->definitions[0] = Definition(dst);
9191 ctx->block->instructions.emplace_back(std::move(vec));
9192 }
9193 }
9194
9195 void visit_jump(isel_context *ctx, nir_jump_instr *instr)
9196 {
9197 Builder bld(ctx->program, ctx->block);
9198 Block *logical_target;
9199 append_logical_end(ctx->block);
9200 unsigned idx = ctx->block->index;
9201
9202 switch (instr->type) {
9203 case nir_jump_break:
9204 logical_target = ctx->cf_info.parent_loop.exit;
9205 add_logical_edge(idx, logical_target);
9206 ctx->block->kind |= block_kind_break;
9207
9208 if (!ctx->cf_info.parent_if.is_divergent &&
9209 !ctx->cf_info.parent_loop.has_divergent_continue) {
9210 /* uniform break - directly jump out of the loop */
9211 ctx->block->kind |= block_kind_uniform;
9212 ctx->cf_info.has_branch = true;
9213 bld.branch(aco_opcode::p_branch);
9214 add_linear_edge(idx, logical_target);
9215 return;
9216 }
9217 ctx->cf_info.parent_loop.has_divergent_branch = true;
9218 ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
9219 break;
9220 case nir_jump_continue:
9221 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
9222 add_logical_edge(idx, logical_target);
9223 ctx->block->kind |= block_kind_continue;
9224
9225 if (ctx->cf_info.parent_if.is_divergent) {
9226 /* for potential uniform breaks after this continue,
9227 we must ensure that they are handled correctly */
9228 ctx->cf_info.parent_loop.has_divergent_continue = true;
9229 ctx->cf_info.parent_loop.has_divergent_branch = true;
9230 ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
9231 } else {
9232 /* uniform continue - directly jump to the loop header */
9233 ctx->block->kind |= block_kind_uniform;
9234 ctx->cf_info.has_branch = true;
9235 bld.branch(aco_opcode::p_branch);
9236 add_linear_edge(idx, logical_target);
9237 return;
9238 }
9239 break;
9240 default:
9241 fprintf(stderr, "Unknown NIR jump instr: ");
9242 nir_print_instr(&instr->instr, stderr);
9243 fprintf(stderr, "\n");
9244 abort();
9245 }
9246
9247 if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
9248 ctx->cf_info.exec_potentially_empty_break = true;
9249 ctx->cf_info.exec_potentially_empty_break_depth = ctx->cf_info.loop_nest_depth;
9250 }
9251
9252 /* remove critical edges from linear CFG */
9253 bld.branch(aco_opcode::p_branch);
9254 Block* break_block = ctx->program->create_and_insert_block();
9255 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9256 break_block->kind |= block_kind_uniform;
9257 add_linear_edge(idx, break_block);
9258 /* the loop_header pointer might be invalidated by this point */
9259 if (instr->type == nir_jump_continue)
9260 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
9261 add_linear_edge(break_block->index, logical_target);
9262 bld.reset(break_block);
9263 bld.branch(aco_opcode::p_branch);
9264
9265 Block* continue_block = ctx->program->create_and_insert_block();
9266 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9267 add_linear_edge(idx, continue_block);
9268 append_logical_start(continue_block);
9269 ctx->block = continue_block;
9270 return;
9271 }
9272
9273 void visit_block(isel_context *ctx, nir_block *block)
9274 {
9275 nir_foreach_instr(instr, block) {
9276 switch (instr->type) {
9277 case nir_instr_type_alu:
9278 visit_alu_instr(ctx, nir_instr_as_alu(instr));
9279 break;
9280 case nir_instr_type_load_const:
9281 visit_load_const(ctx, nir_instr_as_load_const(instr));
9282 break;
9283 case nir_instr_type_intrinsic:
9284 visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
9285 break;
9286 case nir_instr_type_tex:
9287 visit_tex(ctx, nir_instr_as_tex(instr));
9288 break;
9289 case nir_instr_type_phi:
9290 visit_phi(ctx, nir_instr_as_phi(instr));
9291 break;
9292 case nir_instr_type_ssa_undef:
9293 visit_undef(ctx, nir_instr_as_ssa_undef(instr));
9294 break;
9295 case nir_instr_type_deref:
9296 break;
9297 case nir_instr_type_jump:
9298 visit_jump(ctx, nir_instr_as_jump(instr));
9299 break;
9300 default:
9301 fprintf(stderr, "Unknown NIR instr type: ");
9302 nir_print_instr(instr, stderr);
9303 fprintf(stderr, "\n");
9304 //abort();
9305 }
9306 }
9307
9308 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9309 ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
9310 }
9311
9312
9313
9314 static Operand create_continue_phis(isel_context *ctx, unsigned first, unsigned last,
9315 aco_ptr<Instruction>& header_phi, Operand *vals)
9316 {
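   /* Propagate the value of a loop-header linear phi through the loop body:
    * continue blocks contribute the phi operand of their continue edge, other blocks
    * merge the values of their linear predecessors (inserting linear phis where they
    * differ). Returns the value live at the end of block 'last'. */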
9317 vals[0] = Operand(header_phi->definitions[0].getTemp());
9318 RegClass rc = vals[0].regClass();
9319
9320 unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
9321
9322 unsigned next_pred = 1;
9323
9324 for (unsigned idx = first + 1; idx <= last; idx++) {
9325 Block& block = ctx->program->blocks[idx];
9326 if (block.loop_nest_depth != loop_nest_depth) {
9327 vals[idx - first] = vals[idx - 1 - first];
9328 continue;
9329 }
9330
9331 if (block.kind & block_kind_continue) {
9332 vals[idx - first] = header_phi->operands[next_pred];
9333 next_pred++;
9334 continue;
9335 }
9336
9337 bool all_same = true;
9338 for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
9339 all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
9340
9341 Operand val;
9342 if (all_same) {
9343 val = vals[block.linear_preds[0] - first];
9344 } else {
9345 aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
9346 aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
9347 for (unsigned i = 0; i < block.linear_preds.size(); i++)
9348 phi->operands[i] = vals[block.linear_preds[i] - first];
9349 val = Operand(Temp(ctx->program->allocateId(), rc));
9350 phi->definitions[0] = Definition(val.getTemp());
9351 block.instructions.emplace(block.instructions.begin(), std::move(phi));
9352 }
9353 vals[idx - first] = val;
9354 }
9355
9356 return vals[last - first];
9357 }
9358
9359 static void visit_loop(isel_context *ctx, nir_loop *loop)
9360 {
9361 //TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
9362 append_logical_end(ctx->block);
9363 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
9364 Builder bld(ctx->program, ctx->block);
9365 bld.branch(aco_opcode::p_branch);
9366 unsigned loop_preheader_idx = ctx->block->index;
9367
9368 Block loop_exit = Block();
9369 loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9370 loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
9371
9372 Block* loop_header = ctx->program->create_and_insert_block();
9373 loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
9374 loop_header->kind |= block_kind_loop_header;
9375 add_edge(loop_preheader_idx, loop_header);
9376 ctx->block = loop_header;
9377
9378 /* emit loop body */
9379 unsigned loop_header_idx = loop_header->index;
9380 loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
9381 append_logical_start(ctx->block);
9382 bool unreachable = visit_cf_list(ctx, &loop->body);
9383
9384 //TODO: what if a loop ends with an unconditional or uniformly branched continue and this branch is never taken?
9385 if (!ctx->cf_info.has_branch) {
9386 append_logical_end(ctx->block);
9387 if (ctx->cf_info.exec_potentially_empty_discard || ctx->cf_info.exec_potentially_empty_break) {
9388 /* Discards can result in code running with an empty exec mask.
9389 * This would result in divergent breaks not ever being taken. As a
9390 * workaround, break the loop when the loop mask is empty instead of
9391 * always continuing. */
9392 ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
9393 unsigned block_idx = ctx->block->index;
9394
9395 /* create helper blocks to avoid critical edges */
9396 Block *break_block = ctx->program->create_and_insert_block();
9397 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9398 break_block->kind = block_kind_uniform;
9399 bld.reset(break_block);
9400 bld.branch(aco_opcode::p_branch);
9401 add_linear_edge(block_idx, break_block);
9402 add_linear_edge(break_block->index, &loop_exit);
9403
9404 Block *continue_block = ctx->program->create_and_insert_block();
9405 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9406 continue_block->kind = block_kind_uniform;
9407 bld.reset(continue_block);
9408 bld.branch(aco_opcode::p_branch);
9409 add_linear_edge(block_idx, continue_block);
9410 add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
9411
9412 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9413 add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
9414 ctx->block = &ctx->program->blocks[block_idx];
9415 } else {
9416 ctx->block->kind |= (block_kind_continue | block_kind_uniform);
9417 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9418 add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9419 else
9420 add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9421 }
9422
9423 bld.reset(ctx->block);
9424 bld.branch(aco_opcode::p_branch);
9425 }
9426
9427 /* Fixup phis in loop header from unreachable blocks.
9428 * has_branch/has_divergent_branch also indicates if the loop ends with a
9429 * break/continue instruction, but we don't emit those if unreachable=true */
9430 if (unreachable) {
9431 assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
9432 bool linear = ctx->cf_info.has_branch;
9433 bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
9434 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
9435 if ((logical && instr->opcode == aco_opcode::p_phi) ||
9436 (linear && instr->opcode == aco_opcode::p_linear_phi)) {
9437 /* the last operand should be the one that needs to be removed */
9438 instr->operands.pop_back();
9439 } else if (!is_phi(instr)) {
9440 break;
9441 }
9442 }
9443 }
9444
9445 /* Fixup linear phis in loop header from expecting a continue. This fixup and
9446 * the previous one shouldn't both happen at once, because a break in the
9447 * merge block would get CSE'd */
9448 if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
9449 unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
9450 Operand vals[num_vals];
9451 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
9452 if (instr->opcode == aco_opcode::p_linear_phi) {
9453 if (ctx->cf_info.has_branch)
9454 instr->operands.pop_back();
9455 else
9456 instr->operands.back() = create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
9457 } else if (!is_phi(instr)) {
9458 break;
9459 }
9460 }
9461 }
9462
9463 ctx->cf_info.has_branch = false;
9464
9465 // TODO: if the loop does not have a single exit, we must add one
9466 /* emit loop successor block */
9467 ctx->block = ctx->program->insert_block(std::move(loop_exit));
9468 append_logical_start(ctx->block);
9469
9470 #if 0
9471 // TODO: check if it is beneficial to not branch on continues
9472 /* trim linear phis in loop header */
9473 for (auto&& instr : loop_entry->instructions) {
9474 if (instr->opcode == aco_opcode::p_linear_phi) {
9475 aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
9476 new_phi->definitions[0] = instr->definitions[0];
9477 for (unsigned i = 0; i < new_phi->operands.size(); i++)
9478 new_phi->operands[i] = instr->operands[i];
9479 /* check that the remaining operands are all the same */
9480 for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
9481 assert(instr->operands[i].tempId() == instr->operands.back().tempId());
9482 instr.swap(new_phi);
9483 } else if (instr->opcode == aco_opcode::p_phi) {
9484 continue;
9485 } else {
9486 break;
9487 }
9488 }
9489 #endif
9490 }
9491
9492 static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond)
9493 {
9494 ic->cond = cond;
9495
9496 append_logical_end(ctx->block);
9497 ctx->block->kind |= block_kind_branch;
9498
9499 /* branch to linear then block */
9500 assert(cond.regClass() == ctx->program->lane_mask);
9501 aco_ptr<Pseudo_branch_instruction> branch;
9502 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
9503 branch->operands[0] = Operand(cond);
9504 ctx->block->instructions.push_back(std::move(branch));
9505
9506 ic->BB_if_idx = ctx->block->index;
9507 ic->BB_invert = Block();
9508 ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9509 /* Invert blocks are intentionally not marked as top level because they
9510 * are not part of the logical cfg. */
9511 ic->BB_invert.kind |= block_kind_invert;
9512 ic->BB_endif = Block();
9513 ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9514 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
9515
9516 ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
9517 ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
9518 ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
9519 ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
9520 ctx->cf_info.parent_if.is_divergent = true;
9521
9522 /* divergent branches use cbranch_execz */
9523 ctx->cf_info.exec_potentially_empty_discard = false;
9524 ctx->cf_info.exec_potentially_empty_break = false;
9525 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
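   /* Note: UINT16_MAX acts as a "no potentially-empty break at any loop level" sentinel here;
    * end_divergent_if() later restores the depth by taking std::min against the saved value. */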
9526
9527 /** emit logical then block */
9528 Block* BB_then_logical = ctx->program->create_and_insert_block();
9529 BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9530 add_edge(ic->BB_if_idx, BB_then_logical);
9531 ctx->block = BB_then_logical;
9532 append_logical_start(BB_then_logical);
9533 }
9534
9535 static void begin_divergent_if_else(isel_context *ctx, if_context *ic)
9536 {
9537 Block *BB_then_logical = ctx->block;
9538 append_logical_end(BB_then_logical);
9539 /* branch from logical then block to invert block */
9540 aco_ptr<Pseudo_branch_instruction> branch;
9541 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9542 BB_then_logical->instructions.emplace_back(std::move(branch));
9543 add_linear_edge(BB_then_logical->index, &ic->BB_invert);
9544 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9545 add_logical_edge(BB_then_logical->index, &ic->BB_endif);
9546 BB_then_logical->kind |= block_kind_uniform;
9547 assert(!ctx->cf_info.has_branch);
9548 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
9549 ctx->cf_info.parent_loop.has_divergent_branch = false;
9550
9551 /** emit linear then block */
9552 Block* BB_then_linear = ctx->program->create_and_insert_block();
9553 BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9554 BB_then_linear->kind |= block_kind_uniform;
9555 add_linear_edge(ic->BB_if_idx, BB_then_linear);
9556 /* branch from linear then block to invert block */
9557 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9558 BB_then_linear->instructions.emplace_back(std::move(branch));
9559 add_linear_edge(BB_then_linear->index, &ic->BB_invert);
9560
9561 /** emit invert merge block */
9562 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
9563 ic->invert_idx = ctx->block->index;
9564
9565 /* branch to linear else block (skip else) */
9566 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0));
9567 branch->operands[0] = Operand(ic->cond);
9568 ctx->block->instructions.push_back(std::move(branch));
9569
9570 ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
9571 ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
9572 ic->exec_potentially_empty_break_depth_old =
9573 std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
9574 /* divergent branches use cbranch_execz */
9575 ctx->cf_info.exec_potentially_empty_discard = false;
9576 ctx->cf_info.exec_potentially_empty_break = false;
9577 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9578
9579 /** emit logical else block */
9580 Block* BB_else_logical = ctx->program->create_and_insert_block();
9581 BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9582 add_logical_edge(ic->BB_if_idx, BB_else_logical);
9583 add_linear_edge(ic->invert_idx, BB_else_logical);
9584 ctx->block = BB_else_logical;
9585 append_logical_start(BB_else_logical);
9586 }
9587
9588 static void end_divergent_if(isel_context *ctx, if_context *ic)
9589 {
9590 Block *BB_else_logical = ctx->block;
9591 append_logical_end(BB_else_logical);
9592
9593 /* branch from logical else block to endif block */
9594 aco_ptr<Pseudo_branch_instruction> branch;
9595 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9596 BB_else_logical->instructions.emplace_back(std::move(branch));
9597 add_linear_edge(BB_else_logical->index, &ic->BB_endif);
9598 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9599 add_logical_edge(BB_else_logical->index, &ic->BB_endif);
9600 BB_else_logical->kind |= block_kind_uniform;
9601
9602 assert(!ctx->cf_info.has_branch);
9603 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
9604
9605
9606 /** emit linear else block */
9607 Block* BB_else_linear = ctx->program->create_and_insert_block();
9608 BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9609 BB_else_linear->kind |= block_kind_uniform;
9610 add_linear_edge(ic->invert_idx, BB_else_linear);
9611
9612 /* branch from linear else block to endif block */
9613 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9614 BB_else_linear->instructions.emplace_back(std::move(branch));
9615 add_linear_edge(BB_else_linear->index, &ic->BB_endif);
9616
9617
9618 /** emit endif merge block */
9619 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
9620 append_logical_start(ctx->block);
9621
9622
9623 ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
9624 ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
9625 ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
9626 ctx->cf_info.exec_potentially_empty_break_depth =
9627 std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
9628 if (ctx->cf_info.loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
9629 !ctx->cf_info.parent_if.is_divergent) {
9630 ctx->cf_info.exec_potentially_empty_break = false;
9631 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9632 }
9633 /* uniform control flow never has an empty exec-mask */
9634 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
9635 ctx->cf_info.exec_potentially_empty_discard = false;
9636 ctx->cf_info.exec_potentially_empty_break = false;
9637 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9638 }
9639 }
9640
9641 static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond)
9642 {
9643 assert(cond.regClass() == s1);
9644
9645 append_logical_end(ctx->block);
9646 ctx->block->kind |= block_kind_uniform;
9647
9648 aco_ptr<Pseudo_branch_instruction> branch;
9649 aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
9650 branch.reset(create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 0));
9651 branch->operands[0] = Operand(cond);
9652 branch->operands[0].setFixed(scc);
9653 ctx->block->instructions.emplace_back(std::move(branch));
9654
9655 ic->BB_if_idx = ctx->block->index;
9656 ic->BB_endif = Block();
9657 ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9658 ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
9659
9660 ctx->cf_info.has_branch = false;
9661 ctx->cf_info.parent_loop.has_divergent_branch = false;
9662
9663 /** emit then block */
9664 Block* BB_then = ctx->program->create_and_insert_block();
9665 BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9666 add_edge(ic->BB_if_idx, BB_then);
9667 append_logical_start(BB_then);
9668 ctx->block = BB_then;
9669 }
9670
9671 static void begin_uniform_if_else(isel_context *ctx, if_context *ic)
9672 {
9673 Block *BB_then = ctx->block;
9674
9675 ic->uniform_has_then_branch = ctx->cf_info.has_branch;
9676 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
9677
9678 if (!ic->uniform_has_then_branch) {
9679 append_logical_end(BB_then);
9680 /* branch from then block to endif block */
9681 aco_ptr<Pseudo_branch_instruction> branch;
9682 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9683 BB_then->instructions.emplace_back(std::move(branch));
9684 add_linear_edge(BB_then->index, &ic->BB_endif);
9685 if (!ic->then_branch_divergent)
9686 add_logical_edge(BB_then->index, &ic->BB_endif);
9687 BB_then->kind |= block_kind_uniform;
9688 }
9689
9690 ctx->cf_info.has_branch = false;
9691 ctx->cf_info.parent_loop.has_divergent_branch = false;
9692
9693 /** emit else block */
9694 Block* BB_else = ctx->program->create_and_insert_block();
9695 BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9696 add_edge(ic->BB_if_idx, BB_else);
9697 append_logical_start(BB_else);
9698 ctx->block = BB_else;
9699 }
9700
9701 static void end_uniform_if(isel_context *ctx, if_context *ic)
9702 {
9703 Block *BB_else = ctx->block;
9704
9705 if (!ctx->cf_info.has_branch) {
9706 append_logical_end(BB_else);
9707 /* branch from then block to endif block */
9708 aco_ptr<Pseudo_branch_instruction> branch;
9709 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9710 BB_else->instructions.emplace_back(std::move(branch));
9711 add_linear_edge(BB_else->index, &ic->BB_endif);
9712 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9713 add_logical_edge(BB_else->index, &ic->BB_endif);
9714 BB_else->kind |= block_kind_uniform;
9715 }
9716
9717 ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
9718 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
9719
9720 /** emit endif merge block */
9721 if (!ctx->cf_info.has_branch) {
9722 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
9723 append_logical_start(ctx->block);
9724 }
9725 }
9726
9727 static bool visit_if(isel_context *ctx, nir_if *if_stmt)
9728 {
9729 Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
9730 Builder bld(ctx->program, ctx->block);
9731 aco_ptr<Pseudo_branch_instruction> branch;
9732 if_context ic;
9733
9734 if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
9735 /**
9736 * Uniform conditionals are represented in the following way*) :
9737 *
9738 * The linear and logical CFG:
9739 * BB_IF
9740 * / \
9741 * BB_THEN (logical) BB_ELSE (logical)
9742 * \ /
9743 * BB_ENDIF
9744 *
9745       * *) Exceptions may be due to break and continue statements within loops.
9746 * If a break/continue happens within uniform control flow, it branches
9747 * to the loop exit/entry block. Otherwise, it branches to the next
9748 * merge block.
9749 **/
9750
9751 // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
9752 assert(cond.regClass() == ctx->program->lane_mask);
9753 cond = bool_to_scalar_condition(ctx, cond);
9754
9755 begin_uniform_if_then(ctx, &ic, cond);
9756 visit_cf_list(ctx, &if_stmt->then_list);
9757
9758 begin_uniform_if_else(ctx, &ic);
9759 visit_cf_list(ctx, &if_stmt->else_list);
9760
9761 end_uniform_if(ctx, &ic);
9762 } else { /* non-uniform condition */
9763 /**
9764 * To maintain a logical and linear CFG without critical edges,
9765 * non-uniform conditionals are represented in the following way*) :
9766 *
9767 * The linear CFG:
9768 * BB_IF
9769 * / \
9770 * BB_THEN (logical) BB_THEN (linear)
9771 * \ /
9772 * BB_INVERT (linear)
9773 * / \
9774 * BB_ELSE (logical) BB_ELSE (linear)
9775 * \ /
9776 * BB_ENDIF
9777 *
9778 * The logical CFG:
9779 * BB_IF
9780 * / \
9781 * BB_THEN (logical) BB_ELSE (logical)
9782 * \ /
9783 * BB_ENDIF
9784 *
9785 * *) Exceptions may be due to break and continue statements within loops
9786 **/
9787
9788 begin_divergent_if_then(ctx, &ic, cond);
9789 visit_cf_list(ctx, &if_stmt->then_list);
9790
9791 begin_divergent_if_else(ctx, &ic);
9792 visit_cf_list(ctx, &if_stmt->else_list);
9793
9794 end_divergent_if(ctx, &ic);
9795 }
9796
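   /* The return value tells the caller whether the code after this if is reachable:
    * it becomes false once a uniform branch was taken or the merge block has no
    * logical predecessors left. */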
9797 return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
9798 }
9799
9800 static bool visit_cf_list(isel_context *ctx,
9801 struct exec_list *list)
9802 {
9803 foreach_list_typed(nir_cf_node, node, node, list) {
9804 switch (node->type) {
9805 case nir_cf_node_block:
9806 visit_block(ctx, nir_cf_node_as_block(node));
9807 break;
9808 case nir_cf_node_if:
9809 if (!visit_if(ctx, nir_cf_node_as_if(node)))
9810 return true;
9811 break;
9812 case nir_cf_node_loop:
9813 visit_loop(ctx, nir_cf_node_as_loop(node));
9814 break;
9815 default:
9816 unreachable("unimplemented cf list type");
9817 }
9818 }
9819 return false;
9820 }
9821
9822 static void create_null_export(isel_context *ctx)
9823 {
9824 /* Some shader stages always need to have exports.
9825     * So when there are none, we need to add a null export.
9826 */
9827
9828 unsigned dest = (ctx->program->stage & hw_fs) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
9829 bool vm = (ctx->program->stage & hw_fs) || ctx->program->chip_class >= GFX10;
9830 Builder bld(ctx->program, ctx->block);
9831 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
9832 /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, vm);
9833 }
9834
9835 static bool export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
9836 {
9837 assert(ctx->stage == vertex_vs ||
9838 ctx->stage == tess_eval_vs ||
9839 ctx->stage == gs_copy_vs ||
9840 ctx->stage == ngg_vertex_gs ||
9841 ctx->stage == ngg_tess_eval_gs);
9842
9843 int offset = (ctx->stage & sw_tes)
9844 ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
9845 : ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
9846 uint64_t mask = ctx->outputs.mask[slot];
9847 if (!is_pos && !mask)
9848 return false;
9849 if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
9850 return false;
9851 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
9852 exp->enabled_mask = mask;
9853 for (unsigned i = 0; i < 4; ++i) {
9854 if (mask & (1 << i))
9855 exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
9856 else
9857 exp->operands[i] = Operand(v1);
9858 }
9859 /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
9860 * Setting valid_mask=1 prevents it and has no other effect.
9861 */
9862 exp->valid_mask = ctx->options->chip_class >= GFX10 && is_pos && *next_pos == 0;
9863 exp->done = false;
9864 exp->compressed = false;
9865 if (is_pos)
9866 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
9867 else
9868 exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
9869 ctx->block->instructions.emplace_back(std::move(exp));
9870
9871 return true;
9872 }
9873
9874 static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
9875 {
9876 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
9877 exp->enabled_mask = 0;
9878 for (unsigned i = 0; i < 4; ++i)
9879 exp->operands[i] = Operand(v1);
9880 if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {
9881 exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]);
9882 exp->enabled_mask |= 0x1;
9883 }
9884 if (ctx->outputs.mask[VARYING_SLOT_LAYER]) {
9885 exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]);
9886 exp->enabled_mask |= 0x4;
9887 }
9888 if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) {
9889 if (ctx->options->chip_class < GFX9) {
9890 exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]);
9891 exp->enabled_mask |= 0x8;
9892 } else {
9893 Builder bld(ctx->program, ctx->block);
9894
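         /* On GFX9+ the viewport index is packed into the upper 16 bits of the export
          * channel that also carries the layer, hence the shift and the OR with the
          * layer value below. */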
9895 Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
9896 Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]));
9897 if (exp->operands[2].isTemp())
9898 out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
9899
9900 exp->operands[2] = Operand(out);
9901 exp->enabled_mask |= 0x4;
9902 }
9903 }
9904 exp->valid_mask = ctx->options->chip_class >= GFX10 && *next_pos == 0;
9905 exp->done = false;
9906 exp->compressed = false;
9907 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
9908 ctx->block->instructions.emplace_back(std::move(exp));
9909 }
9910
9911 static void create_export_phis(isel_context *ctx)
9912 {
9913 /* Used when exports are needed, but the output temps are defined in a preceding block.
9914 * This function will set up phis in order to access the outputs in the next block.
9915 */
9916
9917 assert(ctx->block->instructions.back()->opcode == aco_opcode::p_logical_start);
9918 aco_ptr<Instruction> logical_start = aco_ptr<Instruction>(ctx->block->instructions.back().release());
9919 ctx->block->instructions.pop_back();
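   /* Temporarily remove p_logical_start so that the phis created below end up in
    * front of it; it is re-inserted after the loop. */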
9920
9921 Builder bld(ctx->program, ctx->block);
9922
9923 for (unsigned slot = 0; slot <= VARYING_SLOT_VAR31; ++slot) {
9924 uint64_t mask = ctx->outputs.mask[slot];
9925 for (unsigned i = 0; i < 4; ++i) {
9926 if (!(mask & (1 << i)))
9927 continue;
9928
9929 Temp old = ctx->outputs.temps[slot * 4 + i];
9930 Temp phi = bld.pseudo(aco_opcode::p_phi, bld.def(v1), old, Operand(v1));
9931 ctx->outputs.temps[slot * 4 + i] = phi;
9932 }
9933 }
9934
9935 bld.insert(std::move(logical_start));
9936 }
9937
9938 static void create_vs_exports(isel_context *ctx)
9939 {
9940 assert(ctx->stage == vertex_vs ||
9941 ctx->stage == tess_eval_vs ||
9942 ctx->stage == gs_copy_vs ||
9943 ctx->stage == ngg_vertex_gs ||
9944 ctx->stage == ngg_tess_eval_gs);
9945
9946 radv_vs_output_info *outinfo = (ctx->stage & sw_tes)
9947 ? &ctx->program->info->tes.outinfo
9948 : &ctx->program->info->vs.outinfo;
9949
9950 if (outinfo->export_prim_id && !(ctx->stage & hw_ngg_gs)) {
9951 ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
9952 ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->vs_prim_id);
9953 }
9954
9955 if (ctx->options->key.has_multiview_view_index) {
9956 ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1;
9957 ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
9958 }
9959
9960 /* the order these position exports are created is important */
9961 int next_pos = 0;
9962 bool exported_pos = export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
9963 if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
9964 export_vs_psiz_layer_viewport(ctx, &next_pos);
9965 exported_pos = true;
9966 }
9967 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
9968 exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
9969 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
9970 exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
9971
9972 if (ctx->export_clip_dists) {
9973 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
9974 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
9975 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
9976 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
9977 }
9978
9979 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
9980 if (i < VARYING_SLOT_VAR0 &&
9981 i != VARYING_SLOT_LAYER &&
9982 i != VARYING_SLOT_PRIMITIVE_ID &&
9983 i != VARYING_SLOT_VIEWPORT)
9984 continue;
9985
9986 export_vs_varying(ctx, i, false, NULL);
9987 }
9988
9989 if (!exported_pos)
9990 create_null_export(ctx);
9991 }
9992
9993 static bool export_fs_mrt_z(isel_context *ctx)
9994 {
9995 Builder bld(ctx->program, ctx->block);
9996 unsigned enabled_channels = 0;
9997 bool compr = false;
9998 Operand values[4];
9999
10000 for (unsigned i = 0; i < 4; ++i) {
10001 values[i] = Operand(v1);
10002 }
10003
10004 /* Both stencil and sample mask only need 16-bits. */
10005 if (!ctx->program->info->ps.writes_z &&
10006 (ctx->program->info->ps.writes_stencil ||
10007 ctx->program->info->ps.writes_sample_mask)) {
10008 compr = true; /* COMPR flag */
10009
10010 if (ctx->program->info->ps.writes_stencil) {
10011 /* Stencil should be in X[23:16]. */
10012 values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10013 values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), values[0]);
10014 enabled_channels |= 0x3;
10015 }
10016
10017 if (ctx->program->info->ps.writes_sample_mask) {
10018 /* SampleMask should be in Y[15:0]. */
10019 values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10020 enabled_channels |= 0xc;
10021 }
10022 } else {
10023 if (ctx->program->info->ps.writes_z) {
10024 values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]);
10025 enabled_channels |= 0x1;
10026 }
10027
10028 if (ctx->program->info->ps.writes_stencil) {
10029 values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10030 enabled_channels |= 0x2;
10031 }
10032
10033 if (ctx->program->info->ps.writes_sample_mask) {
10034 values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10035 enabled_channels |= 0x4;
10036 }
10037 }
10038
10039    /* GFX6 (except OLAND and HAINAN) has a bug where it only looks at the X
10040 * writemask component.
10041 */
10042 if (ctx->options->chip_class == GFX6 &&
10043 ctx->options->family != CHIP_OLAND &&
10044 ctx->options->family != CHIP_HAINAN) {
10045 enabled_channels |= 0x1;
10046 }
10047
10048 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
10049 enabled_channels, V_008DFC_SQ_EXP_MRTZ, compr);
10050
10051 return true;
10052 }
10053
10054 static bool export_fs_mrt_color(isel_context *ctx, int slot)
10055 {
10056 Builder bld(ctx->program, ctx->block);
10057 unsigned write_mask = ctx->outputs.mask[slot];
10058 Operand values[4];
10059
10060 for (unsigned i = 0; i < 4; ++i) {
10061 if (write_mask & (1 << i)) {
10062 values[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10063 } else {
10064 values[i] = Operand(v1);
10065 }
10066 }
10067
10068 unsigned target, col_format;
10069 unsigned enabled_channels = 0;
10070 aco_opcode compr_op = (aco_opcode)0;
10071
10072 slot -= FRAG_RESULT_DATA0;
10073 target = V_008DFC_SQ_EXP_MRT + slot;
10074 col_format = (ctx->options->key.fs.col_format >> (4 * slot)) & 0xf;
10075
10076 bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1;
10077 bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1;
10078 bool is_16bit = values[0].regClass() == v2b;
10079
10080 switch (col_format)
10081 {
10082 case V_028714_SPI_SHADER_ZERO:
10083 enabled_channels = 0; /* writemask */
10084 target = V_008DFC_SQ_EXP_NULL;
10085 break;
10086
10087 case V_028714_SPI_SHADER_32_R:
10088 enabled_channels = 1;
10089 break;
10090
10091 case V_028714_SPI_SHADER_32_GR:
10092 enabled_channels = 0x3;
10093 break;
10094
10095 case V_028714_SPI_SHADER_32_AR:
10096 if (ctx->options->chip_class >= GFX10) {
10097 /* Special case: on GFX10, the outputs are different for 32_AR */
10098 enabled_channels = 0x3;
10099 values[1] = values[3];
10100 values[3] = Operand(v1);
10101 } else {
10102 enabled_channels = 0x9;
10103 }
10104 break;
10105
10106 case V_028714_SPI_SHADER_FP16_ABGR:
10107 enabled_channels = 0x5;
10108 compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
10109 if (is_16bit) {
10110 if (ctx->options->chip_class >= GFX9) {
10111 /* Pack the FP16 values together instead of converting them to
10112 * FP32 and back to FP16.
10113             * TODO: use p_create_vector and let the compiler optimize it.
10114 */
10115 compr_op = aco_opcode::v_pack_b32_f16;
10116 } else {
10117 for (unsigned i = 0; i < 4; i++) {
10118 if ((write_mask >> i) & 1)
10119 values[i] = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), values[i]);
10120 }
10121 }
10122 }
10123 break;
10124
10125 case V_028714_SPI_SHADER_UNORM16_ABGR:
10126 enabled_channels = 0x5;
10127 if (is_16bit && ctx->options->chip_class >= GFX9) {
10128 compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
10129 } else {
10130 compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
10131 }
10132 break;
10133
10134 case V_028714_SPI_SHADER_SNORM16_ABGR:
10135 enabled_channels = 0x5;
10136 if (is_16bit && ctx->options->chip_class >= GFX9) {
10137 compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
10138 } else {
10139 compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
10140 }
10141 break;
10142
10143 case V_028714_SPI_SHADER_UINT16_ABGR: {
10144 enabled_channels = 0x5;
10145 compr_op = aco_opcode::v_cvt_pk_u16_u32;
10146 if (is_int8 || is_int10) {
10147 /* clamp */
10148 uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
10149 Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
10150
10151 for (unsigned i = 0; i < 4; i++) {
10152 if ((write_mask >> i) & 1) {
10153 values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
10154 i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val),
10155 values[i]);
10156 }
10157 }
10158 } else if (is_16bit) {
10159 for (unsigned i = 0; i < 4; i++) {
10160 if ((write_mask >> i) & 1) {
10161 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
10162 values[i] = Operand(tmp);
10163 }
10164 }
10165 }
10166 break;
10167 }
10168
10169 case V_028714_SPI_SHADER_SINT16_ABGR:
10170 enabled_channels = 0x5;
10171 compr_op = aco_opcode::v_cvt_pk_i16_i32;
10172 if (is_int8 || is_int10) {
10173 /* clamp */
10174 uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
10175       uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
10176 Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
10177 Temp min_rgb_val = bld.copy(bld.def(s1), Operand(min_rgb));
10178
10179 for (unsigned i = 0; i < 4; i++) {
10180 if ((write_mask >> i) & 1) {
10181 values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
10182 i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val),
10183 values[i]);
10184 values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
10185 i == 3 && is_int10 ? Operand(-2u) : Operand(min_rgb_val),
10186 values[i]);
10187 }
10188 }
10189 } else if (is_16bit) {
10190 for (unsigned i = 0; i < 4; i++) {
10191 if ((write_mask >> i) & 1) {
10192 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
10193 values[i] = Operand(tmp);
10194 }
10195 }
10196 }
10197 break;
10198
10199 case V_028714_SPI_SHADER_32_ABGR:
10200 enabled_channels = 0xF;
10201 break;
10202
10203 default:
10204 break;
10205 }
10206
10207 if (target == V_008DFC_SQ_EXP_NULL)
10208 return false;
10209
10210 /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10211 if (ctx->options->enable_mrt_output_nan_fixup &&
10212 !is_16bit &&
10213 (col_format == V_028714_SPI_SHADER_32_R ||
10214 col_format == V_028714_SPI_SHADER_32_GR ||
10215 col_format == V_028714_SPI_SHADER_32_AR ||
10216 col_format == V_028714_SPI_SHADER_32_ABGR ||
10217 col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10218 for (int i = 0; i < 4; i++) {
10219 if (!(write_mask & (1 << i)))
10220 continue;
10221
10222 Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32,
10223 bld.hint_vcc(bld.def(bld.lm)), values[i],
10224 bld.copy(bld.def(v1), Operand(3u)));
10225 values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
10226 bld.copy(bld.def(v1), Operand(0u)), isnan);
10227 }
10228 }
10229
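   /* A compression opcode packs two components into each 32-bit export channel, so
    * only channels 0 and 1 carry data and the enabled mask is rebuilt per pair. */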
10230 if ((bool) compr_op) {
10231 for (int i = 0; i < 2; i++) {
10232 /* check if at least one of the values to be compressed is enabled */
10233 unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
10234 if (enabled) {
10235 enabled_channels |= enabled << (i*2);
10236 values[i] = bld.vop3(compr_op, bld.def(v1),
10237 values[i*2].isUndefined() ? Operand(0u) : values[i*2],
10238 values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]);
10239 } else {
10240 values[i] = Operand(v1);
10241 }
10242 }
10243 values[2] = Operand(v1);
10244 values[3] = Operand(v1);
10245 } else {
10246 for (int i = 0; i < 4; i++)
10247 values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
10248 }
10249
10250 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
10251 enabled_channels, target, (bool) compr_op);
10252 return true;
10253 }
10254
10255 static void create_fs_exports(isel_context *ctx)
10256 {
10257 bool exported = false;
10258
10259 /* Export depth, stencil and sample mask. */
10260 if (ctx->outputs.mask[FRAG_RESULT_DEPTH] ||
10261 ctx->outputs.mask[FRAG_RESULT_STENCIL] ||
10262 ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
10263 exported |= export_fs_mrt_z(ctx);
10264
10265 /* Export all color render targets. */
10266 for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i)
10267 if (ctx->outputs.mask[i])
10268 exported |= export_fs_mrt_color(ctx, i);
10269
10270 if (!exported)
10271 create_null_export(ctx);
10272 }
10273
10274 static void write_tcs_tess_factors(isel_context *ctx)
10275 {
10276 unsigned outer_comps;
10277 unsigned inner_comps;
10278
10279 switch (ctx->args->options->key.tcs.primitive_mode) {
10280 case GL_ISOLINES:
10281 outer_comps = 2;
10282 inner_comps = 0;
10283 break;
10284 case GL_TRIANGLES:
10285 outer_comps = 3;
10286 inner_comps = 1;
10287 break;
10288 case GL_QUADS:
10289 outer_comps = 4;
10290 inner_comps = 2;
10291 break;
10292 default:
10293 return;
10294 }
10295
10296 Builder bld(ctx->program, ctx->block);
10297
10298 bld.barrier(aco_opcode::p_memory_barrier_shared);
10299 if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
10300 bld.sopp(aco_opcode::s_barrier);
10301
10302 Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
10303 Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u));
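   /* v_bfe_u32 extracts 5 bits starting at bit 8 of tcs_rel_ids, i.e. the TCS
    * invocation ID (assuming the usual rel_ids layout with the relative patch ID
    * in the low bits). */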
10304
10305 Temp invocation_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), invocation_id);
10306 if_context ic_invocation_id_is_zero;
10307 begin_divergent_if_then(ctx, &ic_invocation_id_is_zero, invocation_id_is_zero);
10308 bld.reset(ctx->block);
10309
10310 Temp hs_ring_tess_factor = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_FACTOR * 16u));
10311
10312 std::pair<Temp, unsigned> lds_base = get_tcs_output_lds_offset(ctx);
10313 unsigned stride = inner_comps + outer_comps;
10314 unsigned lds_align = calculate_lds_alignment(ctx, lds_base.second);
10315 Temp tf_inner_vec;
10316 Temp tf_outer_vec;
10317 Temp out[6];
10318 assert(stride <= (sizeof(out) / sizeof(Temp)));
10319
10320 if (ctx->args->options->key.tcs.primitive_mode == GL_ISOLINES) {
10321 // LINES reversal
10322 tf_outer_vec = load_lds(ctx, 4, bld.tmp(v2), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_out_loc, lds_align);
10323 out[1] = emit_extract_vector(ctx, tf_outer_vec, 0, v1);
10324 out[0] = emit_extract_vector(ctx, tf_outer_vec, 1, v1);
10325 } else {
10326 tf_outer_vec = load_lds(ctx, 4, bld.tmp(RegClass(RegType::vgpr, outer_comps)), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_out_loc, lds_align);
10327 tf_inner_vec = load_lds(ctx, 4, bld.tmp(RegClass(RegType::vgpr, inner_comps)), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_in_loc, lds_align);
10328
10329 for (unsigned i = 0; i < outer_comps; ++i)
10330 out[i] = emit_extract_vector(ctx, tf_outer_vec, i, v1);
10331 for (unsigned i = 0; i < inner_comps; ++i)
10332 out[outer_comps + i] = emit_extract_vector(ctx, tf_inner_vec, i, v1);
10333 }
10334
10335 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
10336 Temp tf_base = get_arg(ctx, ctx->args->tess_factor_offset);
10337 Temp byte_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, stride * 4u);
10338 unsigned tf_const_offset = 0;
10339
10340 if (ctx->program->chip_class <= GFX8) {
10341 Temp rel_patch_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), rel_patch_id);
10342 if_context ic_rel_patch_id_is_zero;
10343 begin_divergent_if_then(ctx, &ic_rel_patch_id_is_zero, rel_patch_id_is_zero);
10344 bld.reset(ctx->block);
10345
10346 /* Store the dynamic HS control word. */
10347 Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
10348 bld.mubuf(aco_opcode::buffer_store_dword,
10349 /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
10350 /* immediate OFFSET */ 0, /* OFFEN */ false, /* swizzled */ false, /* idxen*/ false,
10351 /* addr64 */ false, /* disable_wqm */ false, /* glc */ true);
10352 tf_const_offset += 4;
10353
10354 begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);
10355 end_divergent_if(ctx, &ic_rel_patch_id_is_zero);
10356 bld.reset(ctx->block);
10357 }
10358
10359 assert(stride == 2 || stride == 4 || stride == 6);
10360 Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr, 4u);
10361 store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false);
10362
10363 /* Store to offchip for TES to read - only if TES reads them */
10364 if (ctx->args->options->key.tcs.tes_reads_tess_factors) {
10365 Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
10366 Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
10367
10368 std::pair<Temp, unsigned> vmem_offs_outer = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_out_loc);
10369 store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, false);
10370
10371 if (likely(inner_comps)) {
10372 std::pair<Temp, unsigned> vmem_offs_inner = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_in_loc);
10373 store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, false);
10374 }
10375 }
10376
10377 begin_divergent_if_else(ctx, &ic_invocation_id_is_zero);
10378 end_divergent_if(ctx, &ic_invocation_id_is_zero);
10379 }
10380
10381 static void emit_stream_output(isel_context *ctx,
10382 Temp const *so_buffers,
10383 Temp const *so_write_offset,
10384 const struct radv_stream_output *output)
10385 {
10386 unsigned num_comps = util_bitcount(output->component_mask);
10387 unsigned writemask = (1 << num_comps) - 1;
10388 unsigned loc = output->location;
10389 unsigned buf = output->buffer;
10390
10391 assert(num_comps && num_comps <= 4);
10392 if (!num_comps || num_comps > 4)
10393 return;
10394
10395 unsigned start = ffs(output->component_mask) - 1;
10396
10397 Temp out[4];
10398 bool all_undef = true;
10399 assert(ctx->stage & hw_vs);
10400 for (unsigned i = 0; i < num_comps; i++) {
10401 out[i] = ctx->outputs.temps[loc * 4 + start + i];
10402 all_undef = all_undef && !out[i].id();
10403 }
10404 if (all_undef)
10405 return;
10406
10407 while (writemask) {
10408 int start, count;
10409 u_bit_scan_consecutive_range(&writemask, &start, &count);
10410 if (count == 3 && ctx->options->chip_class == GFX6) {
10411 /* GFX6 doesn't support storing vec3, split it. */
10412 writemask |= 1u << (start + 2);
10413 count = 2;
10414 }
10415
10416 unsigned offset = output->offset + start * 4;
10417
10418 Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, count)};
10419 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
10420 for (int i = 0; i < count; ++i)
10421 vec->operands[i] = (ctx->outputs.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand(0u);
10422 vec->definitions[0] = Definition(write_data);
10423 ctx->block->instructions.emplace_back(std::move(vec));
10424
10425 aco_opcode opcode;
10426 switch (count) {
10427 case 1:
10428 opcode = aco_opcode::buffer_store_dword;
10429 break;
10430 case 2:
10431 opcode = aco_opcode::buffer_store_dwordx2;
10432 break;
10433 case 3:
10434 opcode = aco_opcode::buffer_store_dwordx3;
10435 break;
10436 case 4:
10437 opcode = aco_opcode::buffer_store_dwordx4;
10438 break;
10439 default:
10440 unreachable("Unsupported dword count.");
10441 }
10442
10443 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
10444 store->operands[0] = Operand(so_buffers[buf]);
10445 store->operands[1] = Operand(so_write_offset[buf]);
10446 store->operands[2] = Operand((uint32_t) 0);
10447 store->operands[3] = Operand(write_data);
10448 if (offset > 4095) {
10449 /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */
10450 Builder bld(ctx->program, ctx->block);
10451 store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
10452 } else {
10453 store->offset = offset;
10454 }
10455 store->offen = true;
10456 store->glc = true;
10457 store->dlc = false;
10458 store->slc = true;
10459 store->can_reorder = true;
10460 ctx->block->instructions.emplace_back(std::move(store));
10461 }
10462 }
10463
10464 static void emit_streamout(isel_context *ctx, unsigned stream)
10465 {
10466 Builder bld(ctx->program, ctx->block);
10467
10468 Temp so_buffers[4];
10469 Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
10470 for (unsigned i = 0; i < 4; i++) {
10471 unsigned stride = ctx->program->info->so.strides[i];
10472 if (!stride)
10473 continue;
10474
10475 Operand off = bld.copy(bld.def(s1), Operand(i * 16u));
10476 so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, off);
10477 }
10478
10479 Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10480 get_arg(ctx, ctx->args->streamout_config), Operand(0x70010u));
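   /* 0x70010 selects a 7-bit field at offset 16 for s_bfe_u32, so so_vtx_count is
    * bits [22:16] of streamout_config; it is compared against the thread id below
    * to decide which lanes may emit (the exact field meaning is an assumption). */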
10481
10482 Temp tid = emit_mbcnt(ctx, bld.def(v1));
10483
10484 Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
10485
10486 if_context ic;
10487 begin_divergent_if_then(ctx, &ic, can_emit);
10488
10489 bld.reset(ctx->block);
10490
10491 Temp so_write_index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->streamout_write_idx), tid);
10492
10493 Temp so_write_offset[4];
10494
10495 for (unsigned i = 0; i < 4; i++) {
10496 unsigned stride = ctx->program->info->so.strides[i];
10497 if (!stride)
10498 continue;
10499
10500 if (stride == 1) {
10501 Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
10502 get_arg(ctx, ctx->args->streamout_write_idx),
10503 get_arg(ctx, ctx->args->streamout_offset[i]));
10504 Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
10505
10506 so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
10507 } else {
10508 Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
10509 Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u),
10510 get_arg(ctx, ctx->args->streamout_offset[i]));
10511 so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
10512 }
10513 }
10514
10515 for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
10516 struct radv_stream_output *output =
10517 &ctx->program->info->so.outputs[i];
10518 if (stream != output->stream)
10519 continue;
10520
10521 emit_stream_output(ctx, so_buffers, so_write_offset, output);
10522 }
10523
10524 begin_divergent_if_else(ctx, &ic);
10525 end_divergent_if(ctx, &ic);
10526 }
10527
10528 } /* end namespace */
10529
10530 void fix_ls_vgpr_init_bug(isel_context *ctx, Pseudo_instruction *startpgm)
10531 {
10532 assert(ctx->shader->info.stage == MESA_SHADER_VERTEX);
10533 Builder bld(ctx->program, ctx->block);
10534 constexpr unsigned hs_idx = 1u;
10535 Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10536 get_arg(ctx, ctx->args->merged_wave_info),
10537 Operand((8u << 16) | (hs_idx * 8u)));
10538 Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
10539
10540 /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
10541
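   /* When the bug triggers, each LS input is found in an earlier argument register,
    * so whenever no HS threads exist, select the value from the register it actually
    * landed in (the alternate v_cndmask source below). */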
10542 Temp instance_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10543 get_arg(ctx, ctx->args->rel_auto_id),
10544 get_arg(ctx, ctx->args->ac.instance_id),
10545 ls_has_nonzero_hs_threads);
10546 Temp rel_auto_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10547 get_arg(ctx, ctx->args->ac.tcs_rel_ids),
10548 get_arg(ctx, ctx->args->rel_auto_id),
10549 ls_has_nonzero_hs_threads);
10550 Temp vertex_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10551 get_arg(ctx, ctx->args->ac.tcs_patch_id),
10552 get_arg(ctx, ctx->args->ac.vertex_id),
10553 ls_has_nonzero_hs_threads);
10554
10555 ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id;
10556 ctx->arg_temps[ctx->args->rel_auto_id.arg_index] = rel_auto_id;
10557 ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id;
10558 }
10559
10560 void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm)
10561 {
10562 /* Split all arguments except for the first (ring_offsets) and the last
10563 * (exec) so that the dead channels don't stay live throughout the program.
10564 */
10565 for (int i = 1; i < startpgm->definitions.size() - 1; i++) {
10566 if (startpgm->definitions[i].regClass().size() > 1) {
10567 emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
10568 startpgm->definitions[i].regClass().size());
10569 }
10570 }
10571 }
10572
10573 void handle_bc_optimize(isel_context *ctx)
10574 {
10575 /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
10576 Builder bld(ctx->program, ctx->block);
10577 uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
10578 bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
10579 bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
10580 ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
10581 ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
10582 if (uses_center && uses_centroid) {
10583 Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),
10584 get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u));
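      /* With BC_OPTIMIZE enabled, the hardware sets the sign bit of prim_mask when the
       * centroid interpolants were not computed, so prim_mask < 0 selects the center
       * coordinates instead (an assumption about the prim_mask encoding). */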
10585
10586 if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
10587 Temp new_coord[2];
10588 for (unsigned i = 0; i < 2; i++) {
10589 Temp persp_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
10590 Temp persp_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
10591 new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10592 persp_centroid, persp_center, sel);
10593 }
10594 ctx->persp_centroid = bld.tmp(v2);
10595 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
10596 Operand(new_coord[0]), Operand(new_coord[1]));
10597 emit_split_vector(ctx, ctx->persp_centroid, 2);
10598 }
10599
10600 if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
10601 Temp new_coord[2];
10602 for (unsigned i = 0; i < 2; i++) {
10603 Temp linear_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
10604 Temp linear_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
10605 new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10606 linear_centroid, linear_center, sel);
10607 }
10608 ctx->linear_centroid = bld.tmp(v2);
10609 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
10610 Operand(new_coord[0]), Operand(new_coord[1]));
10611 emit_split_vector(ctx, ctx->linear_centroid, 2);
10612 }
10613 }
10614 }
10615
10616 void setup_fp_mode(isel_context *ctx, nir_shader *shader)
10617 {
10618 Program *program = ctx->program;
10619
10620 unsigned float_controls = shader->info.float_controls_execution_mode;
10621
10622 program->next_fp_mode.preserve_signed_zero_inf_nan32 =
10623 float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
10624 program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
10625 float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
10626 FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
10627
10628 program->next_fp_mode.must_flush_denorms32 =
10629 float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
10630 program->next_fp_mode.must_flush_denorms16_64 =
10631 float_controls & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
10632 FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
10633
10634 program->next_fp_mode.care_about_round32 =
10635 float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
10636
10637 program->next_fp_mode.care_about_round16_64 =
10638 float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
10639 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
10640
10641 /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
10642 * the precision seems needed for Wolfenstein: Youngblood to render correctly */
10643 if (program->next_fp_mode.must_flush_denorms16_64)
10644 program->next_fp_mode.denorm16_64 = 0;
10645 else
10646 program->next_fp_mode.denorm16_64 = fp_denorm_keep;
10647
10648 /* preserving fp32 denorms is expensive, so only do it if asked */
10649 if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
10650 program->next_fp_mode.denorm32 = fp_denorm_keep;
10651 else
10652 program->next_fp_mode.denorm32 = 0;
10653
10654 if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
10655 program->next_fp_mode.round32 = fp_round_tz;
10656 else
10657 program->next_fp_mode.round32 = fp_round_ne;
10658
10659 if (float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
10660 program->next_fp_mode.round16_64 = fp_round_tz;
10661 else
10662 program->next_fp_mode.round16_64 = fp_round_ne;
10663
10664 ctx->block->fp_mode = program->next_fp_mode;
10665 }
10666
10667 void cleanup_cfg(Program *program)
10668 {
10669 /* create linear_succs/logical_succs */
10670 for (Block& BB : program->blocks) {
10671 for (unsigned idx : BB.linear_preds)
10672 program->blocks[idx].linear_succs.emplace_back(BB.index);
10673 for (unsigned idx : BB.logical_preds)
10674 program->blocks[idx].logical_succs.emplace_back(BB.index);
10675 }
10676 }
10677
10678 Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i)
10679 {
10680 Builder bld(ctx->program, ctx->block);
10681
10682    /* The s_bfm only cares about s0.u[5:0], so we need neither s_bfe nor s_and here */
10683 Temp count = i == 0
10684 ? get_arg(ctx, ctx->args->merged_wave_info)
10685 : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
10686 get_arg(ctx, ctx->args->merged_wave_info), Operand(i * 8u));
10687
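   /* merged_wave_info packs one 8-bit lane count per merged shader part (part i at
    * bit 8*i, judging by the shift above); s_bfm_b64 then builds a mask with that
    * many low bits set. */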
10688 Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand(0u));
10689 Temp cond;
10690
10691 if (ctx->program->wave_size == 64) {
10692 /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
10693 Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count, Operand(6u /* log2(64) */));
10694 cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), mask, bld.scc(active_64));
10695 } else {
10696 /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of the register */
10697 cond = emit_extract_vector(ctx, mask, 0, bld.lm);
10698 }
10699
10700 return cond;
10701 }
10702
10703 bool ngg_early_prim_export(isel_context *ctx)
10704 {
10705 /* TODO: Check edge flags, and if they are written, return false. (Needed for OpenGL, not for Vulkan.) */
10706 return true;
10707 }
10708
10709 void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx)
10710 {
10711 Builder bld(ctx->program, ctx->block);
10712
10713 /* It is recommended to do the GS_ALLOC_REQ as soon and as quickly as possible, so we set the maximum priority (3). */
10714 bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
10715
10716 /* Get the id of the current wave within the threadgroup (workgroup) */
10717 Builder::Result wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10718 get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16)));
10719
10720    /* Execute the following code only on the first wave (wave id 0);
10721     * use the SCC def to tell whether the wave id is zero or not.
10722 */
10723 Temp cond = wave_id_in_tg.def(1).getTemp();
10724 if_context ic;
10725 begin_uniform_if_then(ctx, &ic, cond);
10726 begin_uniform_if_else(ctx, &ic);
10727 bld.reset(ctx->block);
10728
10729 /* Number of vertices output by VS/TES */
10730 Temp vtx_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10731 get_arg(ctx, ctx->args->gs_tg_info), Operand(12u | (9u << 16u)));
10732 /* Number of primitives output by VS/TES */
10733 Temp prm_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10734 get_arg(ctx, ctx->args->gs_tg_info), Operand(22u | (9u << 16u)));
10735
10736 /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
10737 Temp tmp = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand(12u));
10738 tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);
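   /* The vertex count goes into the low bits of m0 and the primitive count starts at
    * bit 12, matching the shift above. */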
10739
10740 /* Request the SPI to allocate space for the primitives and vertices that will be exported by the threadgroup. */
10741 bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);
10742
10743 end_uniform_if(ctx, &ic);
10744
10745 /* After the GS_ALLOC_REQ is done, reset priority to default (0). */
10746 bld.reset(ctx->block);
10747 bld.sopp(aco_opcode::s_setprio, -1u, 0x0u);
10748 }
10749
10750 Temp ngg_get_prim_exp_arg(isel_context *ctx, unsigned num_vertices, const Temp vtxindex[])
10751 {
10752 Builder bld(ctx->program, ctx->block);
10753
10754 if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) {
10755 return get_arg(ctx, ctx->args->gs_vtx_offset[0]);
10756 }
10757
10758 Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
10759 Temp tmp;
10760
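   /* Each vertex index occupies a 10-bit field of the primitive export argument
    * (vertex i at bit 10*i), with its edge flag placed at bit 10*i + 9, as the
    * shifts below show. */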
10761 for (unsigned i = 0; i < num_vertices; ++i) {
10762 assert(vtxindex[i].id());
10763
10764 if (i)
10765 tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), vtxindex[i], Operand(10u * i), tmp);
10766 else
10767 tmp = vtxindex[i];
10768
10769 /* The initial edge flag is always false in tess eval shaders. */
10770 if (ctx->stage == ngg_vertex_gs) {
10771 Temp edgeflag = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), gs_invocation_id, Operand(8 + i), Operand(1u));
10772 tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), edgeflag, Operand(10u * i + 9u), tmp);
10773 }
10774 }
10775
10776 /* TODO: Set isnull field in case of merged NGG VS+GS. */
10777
10778 return tmp;
10779 }
10780
10781 void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive, const Temp vtxindex[])
10782 {
10783 Builder bld(ctx->program, ctx->block);
10784 Temp prim_exp_arg = ngg_get_prim_exp_arg(ctx, num_vertices_per_primitive, vtxindex);
10785
10786 bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
10787 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */,
10788 false /* compressed */, true/* done */, false /* valid mask */);
10789 }
10790
10791 void ngg_emit_nogs_gsthreads(isel_context *ctx)
10792 {
10793 /* Emit the things that NGG GS threads need to do, for shaders that don't have SW GS.
10794 * These must always come before VS exports.
10795 *
10796 * It is recommended to do these as early as possible. They can be at the beginning when
10797 * there is no SW GS and the shader doesn't write edge flags.
10798 */
10799
10800 if_context ic;
10801 Temp is_gs_thread = merged_wave_info_to_mask(ctx, 1);
10802 begin_divergent_if_then(ctx, &ic, is_gs_thread);
10803
10804 Builder bld(ctx->program, ctx->block);
10805 constexpr unsigned max_vertices_per_primitive = 3;
10806 unsigned num_vertices_per_primitive = max_vertices_per_primitive;
10807
10808 if (ctx->stage == ngg_vertex_gs) {
10809 /* TODO: optimize for points & lines */
10810 } else if (ctx->stage == ngg_tess_eval_gs) {
10811 if (ctx->shader->info.tess.point_mode)
10812 num_vertices_per_primitive = 1;
10813 else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)
10814 num_vertices_per_primitive = 2;
10815 } else {
10816 unreachable("Unsupported NGG shader stage");
10817 }
10818
10819 Temp vtxindex[max_vertices_per_primitive];
10820 vtxindex[0] = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
10821 get_arg(ctx, ctx->args->gs_vtx_offset[0]));
10822 vtxindex[1] = num_vertices_per_primitive < 2 ? Temp(0, v1) :
10823 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
10824 get_arg(ctx, ctx->args->gs_vtx_offset[0]), Operand(16u), Operand(16u));
10825 vtxindex[2] = num_vertices_per_primitive < 3 ? Temp(0, v1) :
10826 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
10827 get_arg(ctx, ctx->args->gs_vtx_offset[2]));
10828
10829 /* Export primitive data to the index buffer. */
10830 ngg_emit_prim_export(ctx, num_vertices_per_primitive, vtxindex);
10831
10832 /* Export primitive ID. */
10833 if (ctx->stage == ngg_vertex_gs && ctx->args->options->key.vs_common_out.export_prim_id) {
10834 /* Copy Primitive IDs from GS threads to the LDS address corresponding to the ES thread of the provoking vertex. */
10835 Temp prim_id = get_arg(ctx, ctx->args->ac.gs_prim_id);
10836 Temp provoking_vtx_index = vtxindex[0];
10837 Temp addr = bld.v_mul_imm(bld.def(v1), provoking_vtx_index, 4u);
10838
10839 store_lds(ctx, 4, prim_id, 0x1u, addr, 0u, 4u);
10840 }
10841
10842 begin_divergent_if_else(ctx, &ic);
10843 end_divergent_if(ctx, &ic);
10844 }
10845
10846 void ngg_emit_nogs_output(isel_context *ctx)
10847 {
10848 /* Emits NGG GS output, for stages that don't have SW GS. */
10849
10850 if_context ic;
10851 Builder bld(ctx->program, ctx->block);
10852 bool late_prim_export = !ngg_early_prim_export(ctx);
10853
10854 /* NGG streamout is currently disabled by default. */
10855 assert(!ctx->args->shader_info->so.num_outputs);
10856
10857 if (late_prim_export) {
10858 /* VS exports are output to registers in a predecessor block. Emit phis to get them into this block. */
10859 create_export_phis(ctx);
10860 /* Do what we need to do in the GS threads. */
10861 ngg_emit_nogs_gsthreads(ctx);
10862
10863 /* What comes next should be executed on ES threads. */
10864 Temp is_es_thread = merged_wave_info_to_mask(ctx, 0);
10865 begin_divergent_if_then(ctx, &ic, is_es_thread);
10866 bld.reset(ctx->block);
10867 }
10868
10869 /* Export VS outputs */
10870 ctx->block->kind |= block_kind_export_end;
10871 create_vs_exports(ctx);
10872
10873 /* Export primitive ID */
10874 if (ctx->args->options->key.vs_common_out.export_prim_id) {
10875 Temp prim_id;
10876
10877 if (ctx->stage == ngg_vertex_gs) {
10878 /* Wait for GS threads to store primitive ID in LDS. */
10879 bld.barrier(aco_opcode::p_memory_barrier_shared);
10880 bld.sopp(aco_opcode::s_barrier);
10881
10882 /* Calculate LDS address where the GS threads stored the primitive ID. */
10883 Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10884 get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16)));
10885 Temp thread_id_in_wave = emit_mbcnt(ctx, bld.def(v1));
10886 Temp wave_id_mul = bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_id_in_tg), ctx->program->wave_size);
10887 Temp thread_id_in_tg = bld.vadd32(bld.def(v1), Operand(wave_id_mul), Operand(thread_id_in_wave));
10888 Temp addr = bld.v_mul24_imm(bld.def(v1), thread_id_in_tg, 4u);
10889
10890 /* Load primitive ID from LDS. */
10891 prim_id = load_lds(ctx, 4, bld.tmp(v1), addr, 0u, 4u);
10892 } else if (ctx->stage == ngg_tess_eval_gs) {
10893 /* TES: Just use the patch ID as the primitive ID. */
10894 prim_id = get_arg(ctx, ctx->args->ac.tes_patch_id);
10895 } else {
10896 unreachable("unsupported NGG shader stage.");
10897 }
10898
10899 ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
10900 ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = prim_id;
10901
10902 export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, nullptr);
10903 }
10904
10905 if (late_prim_export) {
10906 begin_divergent_if_else(ctx, &ic);
10907 end_divergent_if(ctx, &ic);
10908 bld.reset(ctx->block);
10909 }
10910 }
10911
10912 void select_program(Program *program,
10913 unsigned shader_count,
10914 struct nir_shader *const *shaders,
10915 ac_shader_config* config,
10916 struct radv_shader_args *args)
10917 {
10918 isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
10919 if_context ic_merged_wave_info;
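   /* ngg_no_gs: the NGG path is used, but there is no API geometry shader (NGG VS or NGG TES). */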
10920 bool ngg_no_gs = ctx.stage == ngg_vertex_gs || ctx.stage == ngg_tess_eval_gs;
10921
10922 for (unsigned i = 0; i < shader_count; i++) {
10923 nir_shader *nir = shaders[i];
10924 init_context(&ctx, nir);
10925
10926 setup_fp_mode(&ctx, nir);
10927
10928 if (!i) {
10929 /* needs to be after init_context() for FS */
10930 Pseudo_instruction *startpgm = add_startpgm(&ctx);
10931 append_logical_start(ctx.block);
10932
10933 if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
10934 fix_ls_vgpr_init_bug(&ctx, startpgm);
10935
10936 split_arguments(&ctx, startpgm);
10937 }
10938
10939 if (ngg_no_gs) {
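         /* Ask the hardware to allocate export space for this threadgroup (GS_ALLOC_REQ); with early primitive export, also export the primitives now. */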
10940 ngg_emit_sendmsg_gs_alloc_req(&ctx);
10941
10942 if (ngg_early_prim_export(&ctx))
10943 ngg_emit_nogs_gsthreads(&ctx);
10944 }
10945
10946       /* In merged shaders (VS+TCS HS, VS+GS, TES+GS), the implementation of the first stage can be completely empty. */
10947 nir_function_impl *func = nir_shader_get_entrypoint(nir);
10948 bool empty_shader = nir_cf_list_is_empty_block(&func->body) &&
10949 ((nir->info.stage == MESA_SHADER_VERTEX &&
10950 (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
10951 (nir->info.stage == MESA_SHADER_TESS_EVAL &&
10952 ctx.stage == tess_eval_geometry_gs));
10953
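      /* In merged shaders, restrict each half to the lanes that belong to it. When the VS outputs feed the TCS directly (tcs_in_out_eq), the branch opened around the first shader is only closed after the second. */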
10954 bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : ((shader_count >= 2 && !empty_shader) || ngg_no_gs);
10955 bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info;
10956 if (check_merged_wave_info) {
10957 Temp cond = merged_wave_info_to_mask(&ctx, i);
10958 begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
10959 }
10960
10961 if (i) {
10962 Builder bld(ctx.program, ctx.block);
10963
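         /* The second half of a merged shader waits here until the first half's LDS writes have completed. */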
10964 bld.barrier(aco_opcode::p_memory_barrier_shared);
10965 bld.sopp(aco_opcode::s_barrier);
10966
10967 if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
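            /* The GS wave id sits in bits [16:24) of merged_wave_info; it is extracted straight into m0, which s_sendmsg reads. */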
10968 ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
10969 }
10970 } else if (ctx.stage == geometry_gs)
10971 ctx.gs_wave_id = get_arg(&ctx, args->gs_wave_id);
10972
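      /* For fragment shaders, resolve the BC_OPTIMIZE selection between center and centroid barycentrics before they are used. */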
10973 if (ctx.stage == fragment_fs)
10974 handle_bc_optimize(&ctx);
10975
10976 visit_cf_list(&ctx, &func->body);
10977
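      /* Per-stage epilogue: streamout and VS exports only happen in the stage that runs as the hardware VS. */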
10978 if (ctx.program->info->so.num_outputs && (ctx.stage & hw_vs))
10979 emit_streamout(&ctx, 0);
10980
10981 if (ctx.stage & hw_vs) {
10982 create_vs_exports(&ctx);
10983 ctx.block->kind |= block_kind_export_end;
10984 } else if (ngg_no_gs && ngg_early_prim_export(&ctx)) {
10985 ngg_emit_nogs_output(&ctx);
10986 } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
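         /* GS: make sure the emitted GS data is visible, then signal that this GS wave is done. */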
10987 Builder bld(ctx.program, ctx.block);
10988 bld.barrier(aco_opcode::p_memory_barrier_gs_data);
10989 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
10990 } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
10991 write_tcs_tess_factors(&ctx);
10992 }
10993
10994 if (ctx.stage == fragment_fs) {
10995 create_fs_exports(&ctx);
10996 ctx.block->kind |= block_kind_export_end;
10997 }
10998
10999 if (endif_merged_wave_info) {
11000 begin_divergent_if_else(&ctx, &ic_merged_wave_info);
11001 end_divergent_if(&ctx, &ic_merged_wave_info);
11002 }
11003
11004 if (ngg_no_gs && !ngg_early_prim_export(&ctx))
11005 ngg_emit_nogs_output(&ctx);
11006
11007 if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
11008 /* Outputs of the previous stage are inputs to the next stage */
11009 ctx.inputs = ctx.outputs;
11010 ctx.outputs = shader_io_state();
11011 }
11012 }
11013
11014 program->config->float_mode = program->blocks[0].fp_mode.val;
11015
11016 append_logical_end(ctx.block);
11017 ctx.block->kind |= block_kind_uniform;
11018 Builder bld(ctx.program, ctx.block);
11019 if (ctx.program->wb_smem_l1_on_end)
11020 bld.smem(aco_opcode::s_dcache_wb, false);
11021 bld.sopp(aco_opcode::s_endpgm);
11022
11023 cleanup_cfg(program);
11024 }
11025
11026 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
11027 ac_shader_config* config,
11028 struct radv_shader_args *args)
11029 {
11030 isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
11031
11032 ctx.block->fp_mode = program->next_fp_mode;
11033
11034 add_startpgm(&ctx);
11035 append_logical_start(ctx.block);
11036
11037 Builder bld(ctx.program, ctx.block);
11038
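   /* Fetch the GSVS ring buffer descriptor (4 dwords) from the private segment buffer. */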
11039 Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), program->private_segment_buffer, Operand(RING_GSVS_VS * 16u));
11040
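   /* If streamout is active, the stream to copy is in bits [24:26) of the streamout config; otherwise only stream 0 is processed. */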
11041 Operand stream_id(0u);
11042 if (args->shader_info->so.num_outputs)
11043 stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11044 get_arg(&ctx, ctx.args->streamout_config), Operand(0x20018u));
11045
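   /* Per-vertex offset passed to the ring loads: vertex_id * 4. */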
11046 Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), get_arg(&ctx, ctx.args->ac.vertex_id));
11047
11048 std::stack<Block> endif_blocks;
11049
11050 for (unsigned stream = 0; stream < 4; stream++) {
11051 if (stream_id.isConstant() && stream != stream_id.constantValue())
11052 continue;
11053
11054 unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
11055 if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
11056 continue;
11057
11058 memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));
11059
11060 unsigned BB_if_idx = ctx.block->index;
11061 Block BB_endif = Block();
11062 if (!stream_id.isConstant()) {
11063 /* begin IF */
11064 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream));
11065 append_logical_end(ctx.block);
11066 ctx.block->kind |= block_kind_uniform;
11067 bld.branch(aco_opcode::p_cbranch_z, cond);
11068
11069 BB_endif.kind |= ctx.block->kind & block_kind_top_level;
11070
11071 ctx.block = ctx.program->create_and_insert_block();
11072 add_edge(BB_if_idx, ctx.block);
11073 bld.reset(ctx.block);
11074 append_logical_start(ctx.block);
11075 }
11076
11077 unsigned offset = 0;
11078 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
11079 if (args->shader_info->gs.output_streams[i] != stream)
11080 continue;
11081
11082 unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
11083 unsigned length = util_last_bit(output_usage_mask);
11084 for (unsigned j = 0; j < length; ++j) {
11085 if (!(output_usage_mask & (1 << j)))
11086 continue;
11087
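            /* Components are laid out sequentially in the ring, vertices_out * 16 * 4 bytes apart; 'offset' counts the components copied so far. */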
11088 unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
11089 Temp voffset = vtx_offset;
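            /* The MUBUF immediate offset field is only 12 bits, so fold anything >= 4096 into the VGPR offset. */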
11090 if (const_offset >= 4096u) {
11091 voffset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), voffset);
11092 const_offset %= 4096u;
11093 }
11094
11095 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)};
11096 mubuf->definitions[0] = bld.def(v1);
11097 mubuf->operands[0] = Operand(gsvs_ring);
11098 mubuf->operands[1] = Operand(voffset);
11099 mubuf->operands[2] = Operand(0u);
11100 mubuf->offen = true;
11101 mubuf->offset = const_offset;
11102 mubuf->glc = true;
11103 mubuf->slc = true;
11104 mubuf->dlc = args->options->chip_class >= GFX10;
11105 mubuf->barrier = barrier_none;
11106 mubuf->can_reorder = true;
11107
11108 ctx.outputs.mask[i] |= 1 << j;
11109 ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();
11110
11111 bld.insert(std::move(mubuf));
11112
11113 offset++;
11114 }
11115 }
11116
11117 if (args->shader_info->so.num_outputs) {
11118 emit_streamout(&ctx, stream);
11119 bld.reset(ctx.block);
11120 }
11121
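      /* Only stream 0 reaches the rasterizer, so only its outputs are exported as VS varyings. */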
11122 if (stream == 0) {
11123 create_vs_exports(&ctx);
11124 ctx.block->kind |= block_kind_export_end;
11125 }
11126
11127 if (!stream_id.isConstant()) {
11128 append_logical_end(ctx.block);
11129
11130 /* branch from then block to endif block */
11131 bld.branch(aco_opcode::p_branch);
11132 add_edge(ctx.block->index, &BB_endif);
11133 ctx.block->kind |= block_kind_uniform;
11134
11135 /* emit else block */
11136 ctx.block = ctx.program->create_and_insert_block();
11137 add_edge(BB_if_idx, ctx.block);
11138 bld.reset(ctx.block);
11139 append_logical_start(ctx.block);
11140
11141 endif_blocks.push(std::move(BB_endif));
11142 }
11143 }
11144
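   /* Pop the pending endif blocks and emit them, closing the innermost stream IF first. */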
11145 while (!endif_blocks.empty()) {
11146 Block BB_endif = std::move(endif_blocks.top());
11147 endif_blocks.pop();
11148
11149 Block *BB_else = ctx.block;
11150
11151 append_logical_end(BB_else);
11152 /* branch from else block to endif block */
11153 bld.branch(aco_opcode::p_branch);
11154 add_edge(BB_else->index, &BB_endif);
11155 BB_else->kind |= block_kind_uniform;
11156
11157       /* emit endif merge block */
11158 ctx.block = program->insert_block(std::move(BB_endif));
11159 bld.reset(ctx.block);
11160 append_logical_start(ctx.block);
11161 }
11162
11163 program->config->float_mode = program->blocks[0].fp_mode.val;
11164
11165 append_logical_end(ctx.block);
11166 ctx.block->kind |= block_kind_uniform;
11167 bld.sopp(aco_opcode::s_endpgm);
11168
11169 cleanup_cfg(program);
11170 }
11171 }