aco: add support for nir_intrinsic_shared_atomic_fadd
[mesa.git] src/amd/compiler/aco_instruction_selection.cpp
1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include <algorithm>
27 #include <array>
28 #include <stack>
29 #include <map>
30
31 #include "ac_shader_util.h"
32 #include "aco_ir.h"
33 #include "aco_builder.h"
34 #include "aco_interface.h"
35 #include "aco_instruction_selection_setup.cpp"
36 #include "util/fast_idiv_by_const.h"
37
38 namespace aco {
39 namespace {
40
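/* RAII helper: saves the loop-related control-flow state of the isel_context on
 * construction, installs the new loop header/exit and bumps loop_nest_depth, and
 * restores the previous state again when the object goes out of scope. */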
41 class loop_info_RAII {
42 isel_context* ctx;
43 unsigned header_idx_old;
44 Block* exit_old;
45 bool divergent_cont_old;
46 bool divergent_branch_old;
47 bool divergent_if_old;
48
49 public:
50 loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
51 : ctx(ctx),
52 header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
53 divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
54 divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
55 divergent_if_old(ctx->cf_info.parent_if.is_divergent)
56 {
57 ctx->cf_info.parent_loop.header_idx = loop_header_idx;
58 ctx->cf_info.parent_loop.exit = loop_exit;
59 ctx->cf_info.parent_loop.has_divergent_continue = false;
60 ctx->cf_info.parent_loop.has_divergent_branch = false;
61 ctx->cf_info.parent_if.is_divergent = false;
62 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
63 }
64
65 ~loop_info_RAII()
66 {
67 ctx->cf_info.parent_loop.header_idx = header_idx_old;
68 ctx->cf_info.parent_loop.exit = exit_old;
69 ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
70 ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
71 ctx->cf_info.parent_if.is_divergent = divergent_if_old;
72 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
73 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
74 ctx->cf_info.exec_potentially_empty_discard = false;
75 }
76 };
77
78 struct if_context {
79 Temp cond;
80
81 bool divergent_old;
82 bool exec_potentially_empty_discard_old;
83 bool exec_potentially_empty_break_old;
84 uint16_t exec_potentially_empty_break_depth_old;
85
86 unsigned BB_if_idx;
87 unsigned invert_idx;
88 bool uniform_has_then_branch;
89 bool then_branch_divergent;
90 Block BB_invert;
91 Block BB_endif;
92 };
93
94 static bool visit_cf_list(struct isel_context *ctx,
95 struct exec_list *list);
96
97 static void add_logical_edge(unsigned pred_idx, Block *succ)
98 {
99 succ->logical_preds.emplace_back(pred_idx);
100 }
101
102
103 static void add_linear_edge(unsigned pred_idx, Block *succ)
104 {
105 succ->linear_preds.emplace_back(pred_idx);
106 }
107
108 static void add_edge(unsigned pred_idx, Block *succ)
109 {
110 add_logical_edge(pred_idx, succ);
111 add_linear_edge(pred_idx, succ);
112 }
113
114 static void append_logical_start(Block *b)
115 {
116 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
117 }
118
119 static void append_logical_end(Block *b)
120 {
121 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
122 }
123
124 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
125 {
126 assert(ctx->allocated[def->index].id());
127 return ctx->allocated[def->index];
128 }
129
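/* Emits a masked bit count of the lanes below the current lane; with the default
 * all-ones masks this yields the lane index within the wave. Wave32 only needs
 * v_mbcnt_lo, wave64 chains v_mbcnt_hi on top (VOP2 on GFX6-7, VOP3 on GFX8+). */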
130 Temp emit_mbcnt(isel_context *ctx, Definition dst,
131 Operand mask_lo = Operand((uint32_t) -1), Operand mask_hi = Operand((uint32_t) -1))
132 {
133 Builder bld(ctx->program, ctx->block);
134 Definition lo_def = ctx->program->wave_size == 32 ? dst : bld.def(v1);
135 Temp thread_id_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, lo_def, mask_lo, Operand(0u));
136
137 if (ctx->program->wave_size == 32) {
138 return thread_id_lo;
139 } else if (ctx->program->chip_class <= GFX7) {
140 Temp thread_id_hi = bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
141 return thread_id_hi;
142 } else {
143 Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, dst, mask_hi, thread_id_lo);
144 return thread_id_hi;
145 }
146 }
147
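/* Copies src into dst, wrapping it in p_wqm for fragment shaders so the value is
 * computed in whole-quad mode (and optionally flagging program->needs_wqm);
 * other stages just get a plain copy. */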
148 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
149 {
150 Builder bld(ctx->program, ctx->block);
151
152 if (!dst.id())
153 dst = bld.tmp(src.regClass());
154
155 assert(src.size() == dst.size());
156
157 if (ctx->stage != fragment_fs) {
158 if (!dst.id())
159 return src;
160
161 bld.copy(Definition(dst), src);
162 return dst;
163 }
164
165 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
166 ctx->program->needs_wqm |= program_needs_wqm;
167 return dst;
168 }
169
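/* Wave-wide lane shuffle: returns, for every lane, the value of 'data' held by lane
 * 'index'. Uniform indices use readlane; GFX6-7 and GFX10 wave64 (emulated with
 * shared VGPRs) go through the p_bpermute pseudo-instruction; everything else uses
 * ds_bpermute_b32 with the index scaled to bytes. */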
170 static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
171 {
172 if (index.regClass() == s1)
173 return bld.readlane(bld.def(s1), data, index);
174
175 if (ctx->options->chip_class <= GFX7) {
176 /* GFX6-7: there is no bpermute instruction */
177 Operand index_op(index);
178 Operand input_data(data);
179 index_op.setLateKill(true);
180 input_data.setLateKill(true);
181
182 return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
183 } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
184 /* GFX10 wave64 mode: emulate full-wave bpermute */
185 if (!ctx->has_gfx10_wave64_bpermute) {
186 ctx->has_gfx10_wave64_bpermute = true;
187 ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
188 ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
189 }
190
191 Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
192 Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
193 Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
194 Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
195 Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
196 Operand input_data(data);
197
198 index_x4.setLateKill(true);
199 input_data.setLateKill(true);
200 same_half.setLateKill(true);
201
202 return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
203 } else {
204 /* GFX8-9 or GFX10 wave32: bpermute works normally */
205 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
206 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
207 }
208 }
209
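/* Applies a ds_swizzle_b32 with the packed (and, or, xor) lane mask; on GFX8+ a few
 * common patterns (quad permutes, row rotate-right 8, row mirror/half-mirror) are
 * emitted as cheaper DPP moves instead. */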
210 static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask)
211 {
212 if (ctx->options->chip_class >= GFX8) {
213 unsigned and_mask = mask & 0x1f;
214 unsigned or_mask = (mask >> 5) & 0x1f;
215 unsigned xor_mask = (mask >> 10) & 0x1f;
216
217 uint16_t dpp_ctrl = 0xffff;
218
219 // TODO: we could use DPP8 for some swizzles
220 if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
221 unsigned res[4] = {0, 1, 2, 3};
222 for (unsigned i = 0; i < 4; i++)
223 res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
224 dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
225 } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
226 dpp_ctrl = dpp_row_rr(8);
227 } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
228 dpp_ctrl = dpp_row_mirror;
229 } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
230 dpp_ctrl = dpp_row_half_mirror;
231 }
232
233 if (dpp_ctrl != 0xffff)
234 return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
235 }
236
237 return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
238 }
239
240 Temp as_vgpr(isel_context *ctx, Temp val)
241 {
242 if (val.type() == RegType::sgpr) {
243 Builder bld(ctx->program, ctx->block);
244 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
245 }
246 assert(val.type() == RegType::vgpr);
247 return val;
248 }
249
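// Unsigned division of 'a' by the compile-time constant 'b'. Power-of-two divisors
// become a plain right shift; other divisors use util_compute_fast_udiv_info and
// apply its steps in order: optional pre-shift, optional increment, a high-32-bit
// multiply by the magic multiplier (v_mul_hi_u32) and an optional post-shift.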
250 //assumes a != 0xffffffff
251 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
252 {
253 assert(b != 0);
254 Builder bld(ctx->program, ctx->block);
255
256 if (util_is_power_of_two_or_zero(b)) {
257 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
258 return;
259 }
260
261 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
262
263 assert(info.multiplier <= 0xffffffff);
264
265 bool pre_shift = info.pre_shift != 0;
266 bool increment = info.increment != 0;
267 bool multiply = true;
268 bool post_shift = info.post_shift != 0;
269
270 if (!pre_shift && !increment && !multiply && !post_shift) {
271 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
272 return;
273 }
274
275 Temp pre_shift_dst = a;
276 if (pre_shift) {
277 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
278 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
279 }
280
281 Temp increment_dst = pre_shift_dst;
282 if (increment) {
283 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
284 bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
285 }
286
287 Temp multiply_dst = increment_dst;
288 if (multiply) {
289 multiply_dst = post_shift ? bld.tmp(v1) : dst;
290 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
291 bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
292 }
293
294 if (post_shift) {
295 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
296 }
297 }
298
299 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
300 {
301 Builder bld(ctx->program, ctx->block);
302 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
303 }
304
305
306 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
307 {
308 /* no need to extract the whole vector */
309 if (src.regClass() == dst_rc) {
310 assert(idx == 0);
311 return src;
312 }
313
314 assert(src.bytes() > (idx * dst_rc.bytes()));
315 Builder bld(ctx->program, ctx->block);
316 auto it = ctx->allocated_vec.find(src.id());
317 if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
318 if (it->second[idx].regClass() == dst_rc) {
319 return it->second[idx];
320 } else {
321 assert(!dst_rc.is_subdword());
322 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
323 return bld.copy(bld.def(dst_rc), it->second[idx]);
324 }
325 }
326
327 if (dst_rc.is_subdword())
328 src = as_vgpr(ctx, src);
329
330 if (src.bytes() == dst_rc.bytes()) {
331 assert(idx == 0);
332 return bld.copy(bld.def(dst_rc), src);
333 } else {
334 Temp dst = bld.tmp(dst_rc);
335 emit_extract_vector(ctx, src, idx, dst);
336 return dst;
337 }
338 }
339
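/* Splits vec_src into num_components equally sized parts with p_split_vector (unless
 * it is already cached) and records the elements in ctx->allocated_vec so that later
 * extracts can reuse them directly. */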
340 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
341 {
342 if (num_components == 1)
343 return;
344 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
345 return;
346 RegClass rc;
347 if (num_components > vec_src.size()) {
348 if (vec_src.type() == RegType::sgpr) {
349 /* should still help get_alu_src() */
350 emit_split_vector(ctx, vec_src, vec_src.size());
351 return;
352 }
353 /* sub-dword split */
354 rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
355 } else {
356 rc = RegClass(vec_src.type(), vec_src.size() / num_components);
357 }
358 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
359 split->operands[0] = Operand(vec_src);
360 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
361 for (unsigned i = 0; i < num_components; i++) {
362 elems[i] = {ctx->program->allocateId(), rc};
363 split->definitions[i] = Definition(elems[i]);
364 }
365 ctx->block->instructions.emplace_back(std::move(split));
366 ctx->allocated_vec.emplace(vec_src.id(), elems);
367 }
368
369 /* This vector expansion uses a mask to determine which elements in the new vector
370 * come from the original vector. The other elements are undefined. */
371 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
372 {
373 emit_split_vector(ctx, vec_src, util_bitcount(mask));
374
375 if (vec_src == dst)
376 return;
377
378 Builder bld(ctx->program, ctx->block);
379 if (num_components == 1) {
380 if (dst.type() == RegType::sgpr)
381 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
382 else
383 bld.copy(Definition(dst), vec_src);
384 return;
385 }
386
387 unsigned component_size = dst.size() / num_components;
388 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
389
390 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
391 vec->definitions[0] = Definition(dst);
392 unsigned k = 0;
393 for (unsigned i = 0; i < num_components; i++) {
394 if (mask & (1 << i)) {
395 Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
396 if (dst.type() == RegType::sgpr)
397 src = bld.as_uniform(src);
398 vec->operands[i] = Operand(src);
399 } else {
400 vec->operands[i] = Operand(0u);
401 }
402 elems[i] = vec->operands[i].getTemp();
403 }
404 ctx->block->instructions.emplace_back(std::move(vec));
405 ctx->allocated_vec.emplace(dst.id(), elems);
406 }
407
408 /* adjust misaligned small bit size loads */
409 void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
410 {
411 Builder bld(ctx->program, ctx->block);
412 Operand shift;
413 Temp select = Temp();
414 if (offset.isConstant()) {
415 assert(offset.constantValue() && offset.constantValue() < 4);
416 shift = Operand(offset.constantValue() * 8);
417 } else {
418 /* bit_offset = 8 * (offset & 0x3) */
419 Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u));
420 select = bld.tmp(s1);
421 shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u));
422 }
423
424 if (vec.size() == 1) {
425 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
426 } else if (vec.size() == 2) {
427 Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
428 bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
429 if (tmp == dst)
430 emit_split_vector(ctx, dst, 2);
431 else
432 emit_extract_vector(ctx, tmp, 0, dst);
433 } else if (vec.size() == 4) {
434 Temp lo = bld.tmp(s2), hi = bld.tmp(s2);
435 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
436 hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
437 if (select != Temp())
438 hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select));
439 lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
440 Temp mid = bld.tmp(s1);
441 lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
442 hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
443 mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
444 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
445 emit_split_vector(ctx, dst, 2);
446 }
447 }
448
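/* Extracts dst from 'vec' starting at the given byte offset: variable offsets are
 * first resolved with v_alignbyte_b32, constant offsets by skipping whole components;
 * the result is recombined into a vector for VGPR destinations, or moved to SGPRs
 * (via byte_align_scalar when still misaligned) otherwise. */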
449 void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
450 {
451 Builder bld(ctx->program, ctx->block);
452 if (offset.isTemp()) {
453 Temp tmp[4] = {vec, vec, vec, vec};
454
455 if (vec.size() == 4) {
456 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
457 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec);
458 } else if (vec.size() == 3) {
459 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
460 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
461 } else if (vec.size() == 2) {
462 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
463 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
464 }
465 for (unsigned i = 0; i < dst.size(); i++)
466 tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
467
468 vec = tmp[0];
469 if (dst.size() == 2)
470 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
471
472 offset = Operand(0u);
473 }
474
475 unsigned num_components = dst.bytes() / component_size;
476 if (vec.regClass() == dst.regClass()) {
477 assert(offset.constantValue() == 0);
478 bld.copy(Definition(dst), vec);
479 emit_split_vector(ctx, dst, num_components);
480 return;
481 }
482
483 emit_split_vector(ctx, vec, vec.bytes() / component_size);
484 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
485 RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
486
487 assert(offset.constantValue() % component_size == 0);
488 unsigned skip = offset.constantValue() / component_size;
489 for (unsigned i = 0; i < num_components; i++)
490 elems[i] = emit_extract_vector(ctx, vec, i + skip, rc);
491
492                 /* if dst is vgpr - recombine the extracted components into a new vector. */
493 if (dst.type() == RegType::vgpr) {
494 aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
495 for (unsigned i = 0; i < num_components; i++)
496 create_vec->operands[i] = Operand(elems[i]);
497 create_vec->definitions[0] = Definition(dst);
498 bld.insert(std::move(create_vec));
499
500 /* if dst is sgpr - split the src, but move the original to sgpr. */
501 } else if (skip) {
502 vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
503 byte_align_scalar(ctx, vec, offset, dst);
504 } else {
505 assert(dst.size() == vec.size());
506 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
507 }
508
509 ctx->allocated_vec.emplace(dst.id(), elems);
510 }
511
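/* Expands an SCC-style s1 boolean into a full lane mask (bld.lm): all-ones if the
 * value is set, zero otherwise, using s_cselect. */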
512 Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
513 {
514 Builder bld(ctx->program, ctx->block);
515 if (!dst.id())
516 dst = bld.tmp(bld.lm);
517
518 assert(val.regClass() == s1);
519 assert(dst.regClass() == bld.lm);
520
521 return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
522 }
523
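/* Collapses a per-lane boolean (bld.lm) into an SCC-style s1 value by ANDing it with
 * exec and taking the SCC result; the result goes through emit_wqm so fragment
 * shaders compute it in WQM. */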
524 Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
525 {
526 Builder bld(ctx->program, ctx->block);
527 if (!dst.id())
528 dst = bld.tmp(s1);
529
530 assert(val.regClass() == bld.lm);
531 assert(dst.regClass() == s1);
532
533 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
534 Temp tmp = bld.tmp(s1);
535 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
536 return emit_wqm(ctx, tmp, dst);
537 }
538
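/* Fetches a NIR ALU source as a temporary of 'size' components, resolving the
 * swizzle: identity swizzles return the SSA temp directly, sub-dword SGPR sources
 * are extracted with s_bfe_u32, and everything else is handled with
 * extract_vector/create_vector. */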
539 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
540 {
541 if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
542 return get_ssa_temp(ctx, src.src.ssa);
543
544 if (src.src.ssa->num_components == size) {
545 bool identity_swizzle = true;
546 for (unsigned i = 0; identity_swizzle && i < size; i++) {
547 if (src.swizzle[i] != i)
548 identity_swizzle = false;
549 }
550 if (identity_swizzle)
551 return get_ssa_temp(ctx, src.src.ssa);
552 }
553
554 Temp vec = get_ssa_temp(ctx, src.src.ssa);
555 unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
556 assert(elem_size > 0);
557 assert(vec.bytes() % elem_size == 0);
558
559 if (elem_size < 4 && vec.type() == RegType::sgpr) {
560 assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
561 assert(size == 1);
562 unsigned swizzle = src.swizzle[0];
563 if (vec.size() > 1) {
564 assert(src.src.ssa->bit_size == 16);
565 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
566 swizzle = swizzle & 1;
567 }
568 if (swizzle == 0)
569 return vec;
570
571 Temp dst{ctx->program->allocateId(), s1};
572 aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 2)};
573 bfe->operands[0] = Operand(vec);
574 bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle)));
575 bfe->definitions[0] = Definition(dst);
576 bfe->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
577 ctx->block->instructions.emplace_back(std::move(bfe));
578 return dst;
579 }
580
581 RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4);
582 if (size == 1) {
583 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
584 } else {
585 assert(size <= 4);
586 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
587 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
588 for (unsigned i = 0; i < size; ++i) {
589 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
590 vec_instr->operands[i] = Operand{elems[i]};
591 }
592 Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size / 4)};
593 vec_instr->definitions[0] = Definition(dst);
594 ctx->block->instructions.emplace_back(std::move(vec_instr));
595 ctx->allocated_vec.emplace(dst.id(), elems);
596 return dst;
597 }
598 }
599
600 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
601 {
602 if (ptr.size() == 2)
603 return ptr;
604 Builder bld(ctx->program, ctx->block);
605 if (ptr.type() == RegType::vgpr)
606 ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
607 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
608 ptr, Operand((unsigned)ctx->options->address32_hi));
609 }
610
611 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
612 {
613 aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
614 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
615 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
616 sop2->definitions[0] = Definition(dst);
617 if (instr->no_unsigned_wrap)
618 sop2->definitions[0].setNUW(true);
619 if (writes_scc)
620 sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
621 ctx->block->instructions.emplace_back(std::move(sop2));
622 }
623
624 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
625 bool commutative, bool swap_srcs=false, bool flush_denorms = false)
626 {
627 Builder bld(ctx->program, ctx->block);
628 bld.is_precise = instr->exact;
629
630 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
631 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
632 if (src1.type() == RegType::sgpr) {
633 if (commutative && src0.type() == RegType::vgpr) {
634 Temp t = src0;
635 src0 = src1;
636 src1 = t;
637 } else {
638 src1 = as_vgpr(ctx, src1);
639 }
640 }
641
642 if (flush_denorms && ctx->program->chip_class < GFX9) {
643 assert(dst.size() == 1);
644 Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
645 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
646 } else {
647 bld.vop2(op, Definition(dst), src0, src1);
648 }
649 }
650
651 void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr,
652 aco_opcode op, Temp dst)
653 {
654 Builder bld(ctx->program, ctx->block);
655 bld.is_precise = instr->exact;
656
657 Temp src0 = get_alu_src(ctx, instr->src[0]);
658 Temp src1 = get_alu_src(ctx, instr->src[1]);
659
660 if (src1.type() == RegType::sgpr) {
661 assert(src0.type() == RegType::vgpr);
662 std::swap(src0, src1);
663 }
664
665 Temp src00 = bld.tmp(src0.type(), 1);
666 Temp src01 = bld.tmp(src0.type(), 1);
667 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
668 Temp src10 = bld.tmp(v1);
669 Temp src11 = bld.tmp(v1);
670 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
671 Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
672 Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
673 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
674 }
675
676 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
677 bool flush_denorms = false)
678 {
679 Temp src0 = get_alu_src(ctx, instr->src[0]);
680 Temp src1 = get_alu_src(ctx, instr->src[1]);
681 Temp src2 = get_alu_src(ctx, instr->src[2]);
682
683 /* ensure that the instruction has at most 1 sgpr operand
684 * The optimizer will inline constants for us */
685 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
686 src0 = as_vgpr(ctx, src0);
687 if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
688 src1 = as_vgpr(ctx, src1);
689 if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
690 src2 = as_vgpr(ctx, src2);
691
692 Builder bld(ctx->program, ctx->block);
693 bld.is_precise = instr->exact;
694 if (flush_denorms && ctx->program->chip_class < GFX9) {
695 assert(dst.size() == 1);
696 Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2);
697 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
698 } else {
699 bld.vop3(op, Definition(dst), src0, src1, src2);
700 }
701 }
702
703 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
704 {
705 Builder bld(ctx->program, ctx->block);
706 bld.is_precise = instr->exact;
707 if (dst.type() == RegType::sgpr)
708 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
709 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
710 else
711 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
712 }
713
714 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
715 {
716 Temp src0 = get_alu_src(ctx, instr->src[0]);
717 Temp src1 = get_alu_src(ctx, instr->src[1]);
718 assert(src0.size() == src1.size());
719
720 aco_ptr<Instruction> vopc;
721 if (src1.type() == RegType::sgpr) {
722 if (src0.type() == RegType::vgpr) {
723 /* to swap the operands, we might also have to change the opcode */
724 switch (op) {
725 case aco_opcode::v_cmp_lt_f16:
726 op = aco_opcode::v_cmp_gt_f16;
727 break;
728 case aco_opcode::v_cmp_ge_f16:
729 op = aco_opcode::v_cmp_le_f16;
730 break;
731 case aco_opcode::v_cmp_lt_i16:
732 op = aco_opcode::v_cmp_gt_i16;
733 break;
734 case aco_opcode::v_cmp_ge_i16:
735 op = aco_opcode::v_cmp_le_i16;
736 break;
737 case aco_opcode::v_cmp_lt_u16:
738 op = aco_opcode::v_cmp_gt_u16;
739 break;
740 case aco_opcode::v_cmp_ge_u16:
741 op = aco_opcode::v_cmp_le_u16;
742 break;
743 case aco_opcode::v_cmp_lt_f32:
744 op = aco_opcode::v_cmp_gt_f32;
745 break;
746 case aco_opcode::v_cmp_ge_f32:
747 op = aco_opcode::v_cmp_le_f32;
748 break;
749 case aco_opcode::v_cmp_lt_i32:
750 op = aco_opcode::v_cmp_gt_i32;
751 break;
752 case aco_opcode::v_cmp_ge_i32:
753 op = aco_opcode::v_cmp_le_i32;
754 break;
755 case aco_opcode::v_cmp_lt_u32:
756 op = aco_opcode::v_cmp_gt_u32;
757 break;
758 case aco_opcode::v_cmp_ge_u32:
759 op = aco_opcode::v_cmp_le_u32;
760 break;
761 case aco_opcode::v_cmp_lt_f64:
762 op = aco_opcode::v_cmp_gt_f64;
763 break;
764 case aco_opcode::v_cmp_ge_f64:
765 op = aco_opcode::v_cmp_le_f64;
766 break;
767 case aco_opcode::v_cmp_lt_i64:
768 op = aco_opcode::v_cmp_gt_i64;
769 break;
770 case aco_opcode::v_cmp_ge_i64:
771 op = aco_opcode::v_cmp_le_i64;
772 break;
773 case aco_opcode::v_cmp_lt_u64:
774 op = aco_opcode::v_cmp_gt_u64;
775 break;
776 case aco_opcode::v_cmp_ge_u64:
777 op = aco_opcode::v_cmp_le_u64;
778 break;
779 default: /* eq and ne are commutative */
780 break;
781 }
782 Temp t = src0;
783 src0 = src1;
784 src1 = t;
785 } else {
786 src1 = as_vgpr(ctx, src1);
787 }
788 }
789
790 Builder bld(ctx->program, ctx->block);
791 bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
792 }
793
794 void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
795 {
796 Temp src0 = get_alu_src(ctx, instr->src[0]);
797 Temp src1 = get_alu_src(ctx, instr->src[1]);
798 Builder bld(ctx->program, ctx->block);
799
800 assert(dst.regClass() == bld.lm);
801 assert(src0.type() == RegType::sgpr);
802 assert(src1.type() == RegType::sgpr);
803 assert(src0.regClass() == src1.regClass());
804
805 /* Emit the SALU comparison instruction */
806 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
807 /* Turn the result into a per-lane bool */
808 bool_to_vector_condition(ctx, cmp, dst);
809 }
810
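/* Chooses between the SALU and VALU form of a comparison: the scalar opcode is only
 * used when a scalar form exists for the bit size, the result is uniform and both
 * sources already live in SGPRs. */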
811 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
812 aco_opcode v16_op, aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
813 {
814 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
815 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
816 bool use_valu = s_op == aco_opcode::num_opcodes ||
817 nir_dest_is_divergent(instr->dest.dest) ||
818 ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
819 ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
820 aco_opcode op = use_valu ? v_op : s_op;
821 assert(op != aco_opcode::num_opcodes);
822 assert(dst.regClass() == ctx->program->lane_mask);
823
824 if (use_valu)
825 emit_vopc_instruction(ctx, instr, op, dst);
826 else
827 emit_sopc_instruction(ctx, instr, op, dst);
828 }
829
830 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
831 {
832 Builder bld(ctx->program, ctx->block);
833 Temp src0 = get_alu_src(ctx, instr->src[0]);
834 Temp src1 = get_alu_src(ctx, instr->src[1]);
835
836 assert(dst.regClass() == bld.lm);
837 assert(src0.regClass() == bld.lm);
838 assert(src1.regClass() == bld.lm);
839
840 bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
841 }
842
843 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
844 {
845 Builder bld(ctx->program, ctx->block);
846 Temp cond = get_alu_src(ctx, instr->src[0]);
847 Temp then = get_alu_src(ctx, instr->src[1]);
848 Temp els = get_alu_src(ctx, instr->src[2]);
849
850 assert(cond.regClass() == bld.lm);
851
852 if (dst.type() == RegType::vgpr) {
853 aco_ptr<Instruction> bcsel;
854 if (dst.size() == 1) {
855 then = as_vgpr(ctx, then);
856 els = as_vgpr(ctx, els);
857
858 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
859 } else if (dst.size() == 2) {
860 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
861 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
862 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
863 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
864
865 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
866 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
867
868 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
869 } else {
870 fprintf(stderr, "Unimplemented NIR instr bit size: ");
871 nir_print_instr(&instr->instr, stderr);
872 fprintf(stderr, "\n");
873 }
874 return;
875 }
876
877 if (instr->dest.dest.ssa.bit_size == 1) {
878 assert(dst.regClass() == bld.lm);
879 assert(then.regClass() == bld.lm);
880 assert(els.regClass() == bld.lm);
881 }
882
883 if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
884 if (dst.regClass() == s1 || dst.regClass() == s2) {
885 assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
886 assert(dst.size() == then.size());
887 aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
888 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
889 } else {
890 fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
891 nir_print_instr(&instr->instr, stderr);
892 fprintf(stderr, "\n");
893 }
894 return;
895 }
896
897    /* divergent boolean bcsel
898     * this implements bcsel on bools: dst = s0 ? s1 : s2
899     * which is lowered to: dst = (s0 & s1) | (~s0 & s2) */
900 assert(instr->dest.dest.ssa.bit_size == 1);
901
902 if (cond.id() != then.id())
903 then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
904
905 if (cond.id() == els.id())
906 bld.sop1(Builder::s_mov, Definition(dst), then);
907 else
908 bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
909 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
910 }
911
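/* Evaluates the single-operand float op twice, once on the raw value and once on the
 * value pre-multiplied by 2^24, and selects the scaled result (corrected by 'undo')
 * for lanes whose input v_cmp_class_f32 flags as a denormal. */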
912 void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
913 aco_opcode op, uint32_t undo)
914 {
915 /* multiply by 16777216 to handle denormals */
916 Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
917 as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
918 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
919 scaled = bld.vop1(op, bld.def(v1), scaled);
920 scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);
921
922 Temp not_scaled = bld.vop1(op, bld.def(v1), val);
923
924 bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
925 }
926
927 void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
928 {
929 if (ctx->block->fp_mode.denorm32 == 0) {
930 bld.vop1(aco_opcode::v_rcp_f32, dst, val);
931 return;
932 }
933
934 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
935 }
936
937 void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
938 {
939 if (ctx->block->fp_mode.denorm32 == 0) {
940 bld.vop1(aco_opcode::v_rsq_f32, dst, val);
941 return;
942 }
943
944 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
945 }
946
947 void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
948 {
949 if (ctx->block->fp_mode.denorm32 == 0) {
950 bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
951 return;
952 }
953
954 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
955 }
956
957 void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
958 {
959 if (ctx->block->fp_mode.denorm32 == 0) {
960 bld.vop1(aco_opcode::v_log_f32, dst, val);
961 return;
962 }
963
964 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
965 }
966
967 Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
968 {
969 if (ctx->options->chip_class >= GFX7)
970 return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
971
972 /* GFX6 doesn't support V_TRUNC_F64, lower it. */
973 /* TODO: create more efficient code! */
974 if (val.type() == RegType::sgpr)
975 val = as_vgpr(ctx, val);
976
977 /* Split the input value. */
978 Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
979 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
980
981 /* Extract the exponent and compute the unbiased value. */
982 Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
983 exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));
984
985 /* Extract the fractional part. */
986 Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
987 fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
988
989 Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
990 bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask);
991
992 Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
993 Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
994 fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
995 tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
996 fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
997
998 /* Get the sign bit. */
999 Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi);
1000
1001 /* Decide the operation to apply depending on the unbiased exponent. */
1002 Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
1003 Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0);
1004 Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
1005 Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u));
1006 dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
1007 dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
1008
1009 return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
1010 }
1011
1012 Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
1013 {
1014 if (ctx->options->chip_class >= GFX7)
1015 return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1016
1017 /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1018 * lowered at NIR level for precision reasons). */
1019 Temp src0 = as_vgpr(ctx, val);
1020
1021 Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
1022 Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu));
1023
1024 Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
1025 Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
1026 Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
1027
1028 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1029 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
1030 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1031 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1032
1033 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1034 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1035
1036 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1037
1038 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
1039 static_cast<VOP3A_instruction*>(add)->neg[1] = true;
1040
1041 return add->definitions[0].getTemp();
1042 }
1043
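/* Integer width conversion: sign- or zero-extends (or truncates) src from src_bits to
 * dst_bits, using s_sext/s_and for SGPRs, SDWA moves on GFX8+ and v_bfe on GFX6-7;
 * 64-bit destinations get their high half from an arithmetic shift or a zero. */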
1044 Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) {
1045 if (!dst.id()) {
1046 if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
1047 dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
1048 else
1049 dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
1050 }
1051
1052 if (dst.bytes() == src.bytes() && dst_bits < src_bits)
1053 return bld.copy(Definition(dst), src);
1054 else if (dst.bytes() < src.bytes())
1055 return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
1056
1057 Temp tmp = dst;
1058 if (dst_bits == 64)
1059 tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
1060
1061 if (tmp == src) {
1062 } else if (src.regClass() == s1) {
1063 if (is_signed)
1064 bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src);
1065 else
1066 bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src);
1067 } else if (ctx->options->chip_class >= GFX8) {
1068 assert(src_bits != 8 || src.regClass() == v1b);
1069 assert(src_bits != 16 || src.regClass() == v2b);
1070 aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
1071 sdwa->operands[0] = Operand(src);
1072 sdwa->definitions[0] = Definition(tmp);
1073 if (is_signed)
1074 sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
1075 else
1076 sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
1077 sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
1078 bld.insert(std::move(sdwa));
1079 } else {
1080 assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
1081 aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
1082 bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u));
1083 }
1084
1085 if (dst_bits == 64) {
1086 if (is_signed && dst.regClass() == s2) {
1087 Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u));
1088 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
1089 } else if (is_signed && dst.regClass() == v2) {
1090 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp);
1091 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
1092 } else {
1093 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
1094 }
1095 }
1096
1097 return dst;
1098 }
1099
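/* Lowers a single NIR ALU instruction to ACO IR, dispatching on the NIR opcode and on
 * the destination register class (SGPR vs. VGPR, 32- vs. 64-bit). */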
1100 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
1101 {
1102 if (!instr->dest.dest.is_ssa) {
1103 fprintf(stderr, "nir alu dst not in ssa: ");
1104 nir_print_instr(&instr->instr, stderr);
1105 fprintf(stderr, "\n");
1106 abort();
1107 }
1108 Builder bld(ctx->program, ctx->block);
1109 bld.is_precise = instr->exact;
1110 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
1111 switch(instr->op) {
1112 case nir_op_vec2:
1113 case nir_op_vec3:
1114 case nir_op_vec4: {
1115 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
1116 unsigned num = instr->dest.dest.ssa.num_components;
1117 for (unsigned i = 0; i < num; ++i)
1118 elems[i] = get_alu_src(ctx, instr->src[i]);
1119
1120 if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
1121 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
1122 RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
1123 for (unsigned i = 0; i < num; ++i) {
1124 if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1125 vec->operands[i] = Operand(emit_extract_vector(ctx, elems[i], 0, elem_rc));
1126 else
1127 vec->operands[i] = Operand{elems[i]};
1128 }
1129 vec->definitions[0] = Definition(dst);
1130 ctx->block->instructions.emplace_back(std::move(vec));
1131 ctx->allocated_vec.emplace(dst.id(), elems);
1132 } else {
1133 // TODO: that is a bit suboptimal..
1134 Temp mask = bld.copy(bld.def(s1), Operand((1u << instr->dest.dest.ssa.bit_size) - 1));
1135 for (unsigned i = 0; i < num - 1; ++i)
1136 if (((i+1) * instr->dest.dest.ssa.bit_size) % 32)
1137 elems[i] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1138 for (unsigned i = 0; i < num; ++i) {
1139 unsigned bit = i * instr->dest.dest.ssa.bit_size;
1140 if (bit % 32 == 0) {
1141 elems[bit / 32] = elems[i];
1142 } else {
1143 elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
1144 elems[i], Operand((i * instr->dest.dest.ssa.bit_size) % 32));
1145 elems[bit / 32] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[bit / 32], elems[i]);
1146 }
1147 }
1148 if (dst.size() == 1)
1149 bld.copy(Definition(dst), elems[0]);
1150 else
1151 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), elems[0], elems[1]);
1152 }
1153 break;
1154 }
1155 case nir_op_mov: {
1156 Temp src = get_alu_src(ctx, instr->src[0]);
1157 aco_ptr<Instruction> mov;
1158 if (dst.type() == RegType::sgpr) {
1159 if (src.type() == RegType::vgpr)
1160 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1161 else if (src.regClass() == s1)
1162 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
1163 else if (src.regClass() == s2)
1164 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
1165 else
1166 unreachable("wrong src register class for nir_op_imov");
1167 } else {
1168 if (dst.regClass() == v1)
1169 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
1170 else if (dst.regClass() == v1b ||
1171 dst.regClass() == v2b ||
1172 dst.regClass() == v2)
1173 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
1174 else
1175 unreachable("wrong src register class for nir_op_imov");
1176 }
1177 break;
1178 }
1179 case nir_op_inot: {
1180 Temp src = get_alu_src(ctx, instr->src[0]);
1181 if (instr->dest.dest.ssa.bit_size == 1) {
1182 assert(src.regClass() == bld.lm);
1183 assert(dst.regClass() == bld.lm);
1184 /* Don't use s_andn2 here, this allows the optimizer to make a better decision */
1185 Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
1186 bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
1187 } else if (dst.regClass() == v1) {
1188 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1189 } else if (dst.regClass() == v2) {
1190 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1191 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1192 lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1193 hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1194 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1195 } else if (dst.type() == RegType::sgpr) {
1196 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1197 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1198 } else {
1199 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1200 nir_print_instr(&instr->instr, stderr);
1201 fprintf(stderr, "\n");
1202 }
1203 break;
1204 }
1205 case nir_op_ineg: {
1206 Temp src = get_alu_src(ctx, instr->src[0]);
1207 if (dst.regClass() == v1) {
1208 bld.vsub32(Definition(dst), Operand(0u), Operand(src));
1209 } else if (dst.regClass() == s1) {
1210 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
1211 } else if (dst.size() == 2) {
1212 Temp src0 = bld.tmp(dst.type(), 1);
1213 Temp src1 = bld.tmp(dst.type(), 1);
1214 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
1215
1216 if (dst.regClass() == s2) {
1217 Temp carry = bld.tmp(s1);
1218 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
1219 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
1220 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1221 } else {
1222 Temp lower = bld.tmp(v1);
1223 Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
1224 Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
1225 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1226 }
1227 } else {
1228 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1229 nir_print_instr(&instr->instr, stderr);
1230 fprintf(stderr, "\n");
1231 }
1232 break;
1233 }
1234 case nir_op_iabs: {
1235 if (dst.regClass() == s1) {
1236 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
1237 } else if (dst.regClass() == v1) {
1238 Temp src = get_alu_src(ctx, instr->src[0]);
1239 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
1240 } else {
1241 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1242 nir_print_instr(&instr->instr, stderr);
1243 fprintf(stderr, "\n");
1244 }
1245 break;
1246 }
1247 case nir_op_isign: {
1248 Temp src = get_alu_src(ctx, instr->src[0]);
1249 if (dst.regClass() == s1) {
1250 Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand((uint32_t)-1));
1251 bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand(1u));
1252 } else if (dst.regClass() == s2) {
1253 Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
1254 Temp neqz;
1255 if (ctx->program->chip_class >= GFX8)
1256 neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
1257 else
1258 neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp();
1259 /* SCC gets zero-extended to 64 bit */
1260 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1261 } else if (dst.regClass() == v1) {
1262 bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand((uint32_t)-1), src, Operand(1u));
1263 } else if (dst.regClass() == v2) {
1264 Temp upper = emit_extract_vector(ctx, src, 1, v1);
1265 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
1266 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1267 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
1268 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
1269 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1270 } else {
1271 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1272 nir_print_instr(&instr->instr, stderr);
1273 fprintf(stderr, "\n");
1274 }
1275 break;
1276 }
1277 case nir_op_imax: {
1278 if (dst.regClass() == v1) {
1279 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1280 } else if (dst.regClass() == s1) {
1281 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1282 } else {
1283 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1284 nir_print_instr(&instr->instr, stderr);
1285 fprintf(stderr, "\n");
1286 }
1287 break;
1288 }
1289 case nir_op_umax: {
1290 if (dst.regClass() == v1) {
1291 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1292 } else if (dst.regClass() == s1) {
1293 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1294 } else {
1295 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1296 nir_print_instr(&instr->instr, stderr);
1297 fprintf(stderr, "\n");
1298 }
1299 break;
1300 }
1301 case nir_op_imin: {
1302 if (dst.regClass() == v1) {
1303 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1304 } else if (dst.regClass() == s1) {
1305 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1306 } else {
1307 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1308 nir_print_instr(&instr->instr, stderr);
1309 fprintf(stderr, "\n");
1310 }
1311 break;
1312 }
1313 case nir_op_umin: {
1314 if (dst.regClass() == v1) {
1315 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1316 } else if (dst.regClass() == s1) {
1317 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1318 } else {
1319 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1320 nir_print_instr(&instr->instr, stderr);
1321 fprintf(stderr, "\n");
1322 }
1323 break;
1324 }
1325 case nir_op_ior: {
1326 if (instr->dest.dest.ssa.bit_size == 1) {
1327 emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1328 } else if (dst.regClass() == v1) {
1329 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1330 } else if (dst.regClass() == v2) {
1331 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1332 } else if (dst.regClass() == s1) {
1333 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1334 } else if (dst.regClass() == s2) {
1335 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1336 } else {
1337 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1338 nir_print_instr(&instr->instr, stderr);
1339 fprintf(stderr, "\n");
1340 }
1341 break;
1342 }
1343 case nir_op_iand: {
1344 if (instr->dest.dest.ssa.bit_size == 1) {
1345 emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1346 } else if (dst.regClass() == v1) {
1347 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1348 } else if (dst.regClass() == v2) {
1349 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1350 } else if (dst.regClass() == s1) {
1351 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1352 } else if (dst.regClass() == s2) {
1353 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1354 } else {
1355 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1356 nir_print_instr(&instr->instr, stderr);
1357 fprintf(stderr, "\n");
1358 }
1359 break;
1360 }
1361 case nir_op_ixor: {
1362 if (instr->dest.dest.ssa.bit_size == 1) {
1363 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1364 } else if (dst.regClass() == v1) {
1365 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1366 } else if (dst.regClass() == v2) {
1367 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1368 } else if (dst.regClass() == s1) {
1369 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1370 } else if (dst.regClass() == s2) {
1371 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1372 } else {
1373 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1374 nir_print_instr(&instr->instr, stderr);
1375 fprintf(stderr, "\n");
1376 }
1377 break;
1378 }
1379 case nir_op_ushr: {
1380 if (dst.regClass() == v1) {
1381 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1382 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1383 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
1384 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1385 } else if (dst.regClass() == v2) {
1386 bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
1387 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1388 } else if (dst.regClass() == s2) {
1389 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1390 } else if (dst.regClass() == s1) {
1391 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1392 } else {
1393 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1394 nir_print_instr(&instr->instr, stderr);
1395 fprintf(stderr, "\n");
1396 }
1397 break;
1398 }
1399 case nir_op_ishl: {
1400 if (dst.regClass() == v1) {
1401 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
1402 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1403 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
1404 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1405 } else if (dst.regClass() == v2) {
1406 bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
1407 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1408 } else if (dst.regClass() == s1) {
1409 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
1410 } else if (dst.regClass() == s2) {
1411 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1412 } else {
1413 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1414 nir_print_instr(&instr->instr, stderr);
1415 fprintf(stderr, "\n");
1416 }
1417 break;
1418 }
1419 case nir_op_ishr: {
1420 if (dst.regClass() == v1) {
1421 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1422 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1423 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
1424 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1425 } else if (dst.regClass() == v2) {
1426 bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
1427 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1428 } else if (dst.regClass() == s1) {
1429 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1430 } else if (dst.regClass() == s2) {
1431 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1432 } else {
1433 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1434 nir_print_instr(&instr->instr, stderr);
1435 fprintf(stderr, "\n");
1436 }
1437 break;
1438 }
1439 case nir_op_find_lsb: {
1440 Temp src = get_alu_src(ctx, instr->src[0]);
1441 if (src.regClass() == s1) {
1442 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1443 } else if (src.regClass() == v1) {
1444 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1445 } else if (src.regClass() == s2) {
1446 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1447 } else {
1448 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1449 nir_print_instr(&instr->instr, stderr);
1450 fprintf(stderr, "\n");
1451 }
1452 break;
1453 }
1454 case nir_op_ufind_msb:
1455 case nir_op_ifind_msb: {
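/* s_flbit/v_ffbh return the index of the first set bit (or, for ifind_msb,
 * the first bit differing from the sign bit) counted from the MSB, and -1
 * if there is none. NIR wants the index counted from the LSB:
 *    msb = (bits - 1) - msb_rev
 * The borrow of that subtraction selects -1 for the not-found case. */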
1456 Temp src = get_alu_src(ctx, instr->src[0]);
1457 if (src.regClass() == s1 || src.regClass() == s2) {
1458 aco_opcode op = src.regClass() == s2 ?
1459 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
1460 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
1461 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1462
1463 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1464 Operand(src.size() * 32u - 1u), msb_rev);
1465 Temp msb = sub.def(0).getTemp();
1466 Temp carry = sub.def(1).getTemp();
1467
1468 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry));
1469 } else if (src.regClass() == v1) {
1470 aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1471 Temp msb_rev = bld.tmp(v1);
1472 emit_vop1_instruction(ctx, instr, op, msb_rev);
1473 Temp msb = bld.tmp(v1);
1474 Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
1475 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
1476 } else {
1477 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1478 nir_print_instr(&instr->instr, stderr);
1479 fprintf(stderr, "\n");
1480 }
1481 break;
1482 }
1483 case nir_op_bitfield_reverse: {
1484 if (dst.regClass() == s1) {
1485 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1486 } else if (dst.regClass() == v1) {
1487 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1488 } else {
1489 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1490 nir_print_instr(&instr->instr, stderr);
1491 fprintf(stderr, "\n");
1492 }
1493 break;
1494 }
1495 case nir_op_iadd: {
1496 if (dst.regClass() == s1) {
1497 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1498 break;
1499 }
1500
1501 Temp src0 = get_alu_src(ctx, instr->src[0]);
1502 Temp src1 = get_alu_src(ctx, instr->src[1]);
1503 if (dst.regClass() == v1) {
1504 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1505 break;
1506 }
1507
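/* 64-bit addition: split both operands into 32-bit halves, add the low
 * halves and feed the carry into the high-half add (s_addc_u32 on the
 * SALU, vadd32 with carry-in on the VALU). */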
1508 assert(src0.size() == 2 && src1.size() == 2);
1509 Temp src00 = bld.tmp(src0.type(), 1);
1510 Temp src01 = bld.tmp(dst.type(), 1);
1511 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1512 Temp src10 = bld.tmp(src1.type(), 1);
1513 Temp src11 = bld.tmp(dst.type(), 1);
1514 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1515
1516 if (dst.regClass() == s2) {
1517 Temp carry = bld.tmp(s1);
1518 Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1519 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
1520 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1521 } else if (dst.regClass() == v2) {
1522 Temp dst0 = bld.tmp(v1);
1523 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1524 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1525 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1526 } else {
1527 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1528 nir_print_instr(&instr->instr, stderr);
1529 fprintf(stderr, "\n");
1530 }
1531 break;
1532 }
1533 case nir_op_uadd_sat: {
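/* Unsigned saturating add: the SGPR path selects ~0u when the addition
 * produces a carry; on GFX9+ the VOP3 clamp bit of v_add_u32 saturates
 * directly, while older chips use the carry-out of vadd32 to cndmask the
 * result to ~0u. */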
1534 Temp src0 = get_alu_src(ctx, instr->src[0]);
1535 Temp src1 = get_alu_src(ctx, instr->src[1]);
1536 if (dst.regClass() == s1) {
1537 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1538 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1539 src0, src1);
1540 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1541 } else if (dst.regClass() == v1) {
1542 if (ctx->options->chip_class >= GFX9) {
1543 aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1544 add->operands[0] = Operand(src0);
1545 add->operands[1] = Operand(src1);
1546 add->definitions[0] = Definition(dst);
1547 add->clamp = 1;
1548 ctx->block->instructions.emplace_back(std::move(add));
1549 } else {
1550 if (src1.regClass() != v1)
1551 std::swap(src0, src1);
1552 assert(src1.regClass() == v1);
1553 Temp tmp = bld.tmp(v1);
1554 Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1555 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1556 }
1557 } else {
1558 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1559 nir_print_instr(&instr->instr, stderr);
1560 fprintf(stderr, "\n");
1561 }
1562 break;
1563 }
1564 case nir_op_uadd_carry: {
1565 Temp src0 = get_alu_src(ctx, instr->src[0]);
1566 Temp src1 = get_alu_src(ctx, instr->src[1]);
1567 if (dst.regClass() == s1) {
1568 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1569 break;
1570 }
1571 if (dst.regClass() == v1) {
1572 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1573 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1574 break;
1575 }
1576
1577 Temp src00 = bld.tmp(src0.type(), 1);
1578 Temp src01 = bld.tmp(dst.type(), 1);
1579 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1580 Temp src10 = bld.tmp(src1.type(), 1);
1581 Temp src11 = bld.tmp(dst.type(), 1);
1582 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1583 if (dst.regClass() == s2) {
1584 Temp carry = bld.tmp(s1);
1585 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1586 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1587 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1588 } else if (dst.regClass() == v2) {
1589 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1590 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1591 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1592 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1593 } else {
1594 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1595 nir_print_instr(&instr->instr, stderr);
1596 fprintf(stderr, "\n");
1597 }
1598 break;
1599 }
1600 case nir_op_isub: {
1601 if (dst.regClass() == s1) {
1602 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1603 break;
1604 }
1605
1606 Temp src0 = get_alu_src(ctx, instr->src[0]);
1607 Temp src1 = get_alu_src(ctx, instr->src[1]);
1608 if (dst.regClass() == v1) {
1609 bld.vsub32(Definition(dst), src0, src1);
1610 break;
1611 }
1612
1613 Temp src00 = bld.tmp(src0.type(), 1);
1614 Temp src01 = bld.tmp(dst.type(), 1);
1615 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1616 Temp src10 = bld.tmp(src1.type(), 1);
1617 Temp src11 = bld.tmp(dst.type(), 1);
1618 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1619 if (dst.regClass() == s2) {
1620 Temp carry = bld.tmp(s1);
1621 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1622 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1623 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1624 } else if (dst.regClass() == v2) {
1625 Temp lower = bld.tmp(v1);
1626 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1627 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1628 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1629 } else {
1630 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1631 nir_print_instr(&instr->instr, stderr);
1632 fprintf(stderr, "\n");
1633 }
1634 break;
1635 }
1636 case nir_op_usub_borrow: {
1637 Temp src0 = get_alu_src(ctx, instr->src[0]);
1638 Temp src1 = get_alu_src(ctx, instr->src[1]);
1639 if (dst.regClass() == s1) {
1640 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1641 break;
1642 } else if (dst.regClass() == v1) {
1643 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1644 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1645 break;
1646 }
1647
1648 Temp src00 = bld.tmp(src0.type(), 1);
1649 Temp src01 = bld.tmp(dst.type(), 1);
1650 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1651 Temp src10 = bld.tmp(src1.type(), 1);
1652 Temp src11 = bld.tmp(dst.type(), 1);
1653 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1654 if (dst.regClass() == s2) {
1655 Temp borrow = bld.tmp(s1);
1656 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1657 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1658 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1659 } else if (dst.regClass() == v2) {
1660 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1661 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1662 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1663 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1664 } else {
1665 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1666 nir_print_instr(&instr->instr, stderr);
1667 fprintf(stderr, "\n");
1668 }
1669 break;
1670 }
1671 case nir_op_imul: {
1672 if (dst.regClass() == v1) {
1673 bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1674 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1675 } else if (dst.regClass() == s1) {
1676 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1677 } else {
1678 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1679 nir_print_instr(&instr->instr, stderr);
1680 fprintf(stderr, "\n");
1681 }
1682 break;
1683 }
1684 case nir_op_umul_high: {
1685 if (dst.regClass() == v1) {
1686 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1687 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1688 bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1689 } else if (dst.regClass() == s1) {
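/* There is no scalar mul-high before GFX9, so compute it on the VALU and
 * copy the (uniform) result back into an SGPR. */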
1690 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1691 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1692 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1693 } else {
1694 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1695 nir_print_instr(&instr->instr, stderr);
1696 fprintf(stderr, "\n");
1697 }
1698 break;
1699 }
1700 case nir_op_imul_high: {
1701 if (dst.regClass() == v1) {
1702 bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1703 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1704 bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1705 } else if (dst.regClass() == s1) {
1706 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1707 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1708 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1709 } else {
1710 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1711 nir_print_instr(&instr->instr, stderr);
1712 fprintf(stderr, "\n");
1713 }
1714 break;
1715 }
1716 case nir_op_fmul: {
1717 Temp src0 = get_alu_src(ctx, instr->src[0]);
1718 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1719 if (dst.regClass() == v2b) {
1720 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
1721 } else if (dst.regClass() == v1) {
1722 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1723 } else if (dst.regClass() == v2) {
1724 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1);
1725 } else {
1726 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1727 nir_print_instr(&instr->instr, stderr);
1728 fprintf(stderr, "\n");
1729 }
1730 break;
1731 }
1732 case nir_op_fadd: {
1733 Temp src0 = get_alu_src(ctx, instr->src[0]);
1734 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1735 if (dst.regClass() == v2b) {
1736 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
1737 } else if (dst.regClass() == v1) {
1738 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1739 } else if (dst.regClass() == v2) {
1740 bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1);
1741 } else {
1742 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1743 nir_print_instr(&instr->instr, stderr);
1744 fprintf(stderr, "\n");
1745 }
1746 break;
1747 }
1748 case nir_op_fsub: {
1749 Temp src0 = get_alu_src(ctx, instr->src[0]);
1750 Temp src1 = get_alu_src(ctx, instr->src[1]);
1751 if (dst.regClass() == v2b) {
1752 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1753 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
1754 else
1755 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
1756 } else if (dst.regClass() == v1) {
1757 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1758 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1759 else
1760 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1761 } else if (dst.regClass() == v2) {
1762 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1763 as_vgpr(ctx, src0), as_vgpr(ctx, src1));
1764 VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1765 sub->neg[1] = true;
1766 } else {
1767 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1768 nir_print_instr(&instr->instr, stderr);
1769 fprintf(stderr, "\n");
1770 }
1771 break;
1772 }
1773 case nir_op_fmax: {
1774 Temp src0 = get_alu_src(ctx, instr->src[0]);
1775 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1776 if (dst.regClass() == v2b) {
1777 // TODO: check fp_mode.must_flush_denorms16_64
1778 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
1779 } else if (dst.regClass() == v1) {
1780 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1781 } else if (dst.regClass() == v2) {
1782 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
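/* The multiply by 1.0 flushes a denormal result; presumably pre-GFX9
 * v_max_f64 does not honour the denorm-flush mode on its own. */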
1783 Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2), src0, src1);
1784 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1785 } else {
1786 bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1);
1787 }
1788 } else {
1789 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1790 nir_print_instr(&instr->instr, stderr);
1791 fprintf(stderr, "\n");
1792 }
1793 break;
1794 }
1795 case nir_op_fmin: {
1796 Temp src0 = get_alu_src(ctx, instr->src[0]);
1797 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1798 if (dst.regClass() == v2b) {
1799 // TODO: check fp_mode.must_flush_denorms16_64
1800 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
1801 } else if (dst.regClass() == v1) {
1802 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1803 } else if (dst.regClass() == v2) {
1804 if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1805 Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), src0, src1);
1806 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1807 } else {
1808 bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1);
1809 }
1810 } else {
1811 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1812 nir_print_instr(&instr->instr, stderr);
1813 fprintf(stderr, "\n");
1814 }
1815 break;
1816 }
1817 case nir_op_fmax3: {
1818 if (dst.regClass() == v2b) {
1819 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, dst, false);
1820 } else if (dst.regClass() == v1) {
1821 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1822 } else {
1823 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1824 nir_print_instr(&instr->instr, stderr);
1825 fprintf(stderr, "\n");
1826 }
1827 break;
1828 }
1829 case nir_op_fmin3: {
1830 if (dst.regClass() == v2b) {
1831 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, dst, false);
1832 } else if (dst.regClass() == v1) {
1833 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1834 } else {
1835 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1836 nir_print_instr(&instr->instr, stderr);
1837 fprintf(stderr, "\n");
1838 }
1839 break;
1840 }
1841 case nir_op_fmed3: {
1842 if (dst.regClass() == v2b) {
1843 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, dst, false);
1844 } else if (dst.regClass() == v1) {
1845 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1846 } else {
1847 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1848 nir_print_instr(&instr->instr, stderr);
1849 fprintf(stderr, "\n");
1850 }
1851 break;
1852 }
1853 case nir_op_umax3: {
1854 if (dst.size() == 1) {
1855 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1856 } else {
1857 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1858 nir_print_instr(&instr->instr, stderr);
1859 fprintf(stderr, "\n");
1860 }
1861 break;
1862 }
1863 case nir_op_umin3: {
1864 if (dst.size() == 1) {
1865 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1866 } else {
1867 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1868 nir_print_instr(&instr->instr, stderr);
1869 fprintf(stderr, "\n");
1870 }
1871 break;
1872 }
1873 case nir_op_umed3: {
1874 if (dst.size() == 1) {
1875 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1876 } else {
1877 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1878 nir_print_instr(&instr->instr, stderr);
1879 fprintf(stderr, "\n");
1880 }
1881 break;
1882 }
1883 case nir_op_imax3: {
1884 if (dst.size() == 1) {
1885 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1886 } else {
1887 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1888 nir_print_instr(&instr->instr, stderr);
1889 fprintf(stderr, "\n");
1890 }
1891 break;
1892 }
1893 case nir_op_imin3: {
1894 if (dst.size() == 1) {
1895 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1896 } else {
1897 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1898 nir_print_instr(&instr->instr, stderr);
1899 fprintf(stderr, "\n");
1900 }
1901 break;
1902 }
1903 case nir_op_imed3: {
1904 if (dst.size() == 1) {
1905 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1906 } else {
1907 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1908 nir_print_instr(&instr->instr, stderr);
1909 fprintf(stderr, "\n");
1910 }
1911 break;
1912 }
1913 case nir_op_cube_face_coord: {
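/* v_cubesc/v_cubetc select the s/t coordinate for the chosen cube face and
 * v_cubema returns 2.0 * the major-axis coordinate, so sc * rcp(ma) + 0.5
 * remaps the face coordinates into the expected [0, 1] range. */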
1914 Temp in = get_alu_src(ctx, instr->src[0], 3);
1915 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1916 emit_extract_vector(ctx, in, 1, v1),
1917 emit_extract_vector(ctx, in, 2, v1) };
1918 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1919 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1920 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1921 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1922 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1923 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1924 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1925 break;
1926 }
1927 case nir_op_cube_face_index: {
1928 Temp in = get_alu_src(ctx, instr->src[0], 3);
1929 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1930 emit_extract_vector(ctx, in, 1, v1),
1931 emit_extract_vector(ctx, in, 2, v1) };
1932 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1933 break;
1934 }
1935 case nir_op_bcsel: {
1936 emit_bcsel(ctx, instr, dst);
1937 break;
1938 }
1939 case nir_op_frsq: {
1940 Temp src = get_alu_src(ctx, instr->src[0]);
1941 if (dst.regClass() == v2b) {
1942 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
1943 } else if (dst.regClass() == v1) {
1944 emit_rsq(ctx, bld, Definition(dst), src);
1945 } else if (dst.regClass() == v2) {
1946 /* Lowered at NIR level for precision reasons. */
1947 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1948 } else {
1949 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1950 nir_print_instr(&instr->instr, stderr);
1951 fprintf(stderr, "\n");
1952 }
1953 break;
1954 }
1955 case nir_op_fneg: {
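/* Negation just flips the sign bit (fabs below clears it). If the float
 * mode requires flushed denormals, multiply by 1.0 first, since the plain
 * bit operation would preserve a denormal input. */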
1956 Temp src = get_alu_src(ctx, instr->src[0]);
1957 if (dst.regClass() == v2b) {
1958 if (ctx->block->fp_mode.must_flush_denorms16_64)
1959 src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
1960 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
1961 } else if (dst.regClass() == v1) {
1962 if (ctx->block->fp_mode.must_flush_denorms32)
1963 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1964 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1965 } else if (dst.regClass() == v2) {
1966 if (ctx->block->fp_mode.must_flush_denorms16_64)
1967 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1968 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1969 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1970 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1971 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1972 } else {
1973 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1974 nir_print_instr(&instr->instr, stderr);
1975 fprintf(stderr, "\n");
1976 }
1977 break;
1978 }
1979 case nir_op_fabs: {
1980 Temp src = get_alu_src(ctx, instr->src[0]);
1981 if (dst.regClass() == v2b) {
1982 if (ctx->block->fp_mode.must_flush_denorms16_64)
1983 src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
1984 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
1985 } else if (dst.regClass() == v1) {
1986 if (ctx->block->fp_mode.must_flush_denorms32)
1987 src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1988 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1989 } else if (dst.regClass() == v2) {
1990 if (ctx->block->fp_mode.must_flush_denorms16_64)
1991 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1992 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1993 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1994 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1995 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1996 } else {
1997 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1998 nir_print_instr(&instr->instr, stderr);
1999 fprintf(stderr, "\n");
2000 }
2001 break;
2002 }
2003 case nir_op_fsat: {
2004 Temp src = get_alu_src(ctx, instr->src[0]);
2005 if (dst.regClass() == v2b) {
2006 bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src);
2007 } else if (dst.regClass() == v1) {
2008 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
2009 /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
2010 // TODO: confirm that this holds under any circumstances
2011 } else if (dst.regClass() == v2) {
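/* For doubles, an add of +0.0 with the VOP3 clamp bit set clamps the result to [0, 1]. */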
2012 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
2013 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
2014 vop3->clamp = true;
2015 } else {
2016 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2017 nir_print_instr(&instr->instr, stderr);
2018 fprintf(stderr, "\n");
2019 }
2020 break;
2021 }
2022 case nir_op_flog2: {
2023 Temp src = get_alu_src(ctx, instr->src[0]);
2024 if (dst.regClass() == v2b) {
2025 emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2026 } else if (dst.regClass() == v1) {
2027 emit_log2(ctx, bld, Definition(dst), src);
2028 } else {
2029 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2030 nir_print_instr(&instr->instr, stderr);
2031 fprintf(stderr, "\n");
2032 }
2033 break;
2034 }
2035 case nir_op_frcp: {
2036 Temp src = get_alu_src(ctx, instr->src[0]);
2037 if (dst.regClass() == v2b) {
2038 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2039 } else if (dst.regClass() == v1) {
2040 emit_rcp(ctx, bld, Definition(dst), src);
2041 } else if (dst.regClass() == v2) {
2042 /* Lowered at NIR level for precision reasons. */
2043 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2044 } else {
2045 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2046 nir_print_instr(&instr->instr, stderr);
2047 fprintf(stderr, "\n");
2048 }
2049 break;
2050 }
2051 case nir_op_fexp2: {
2052 if (dst.regClass() == v2b) {
2053 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2054 } else if (dst.regClass() == v1) {
2055 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2056 } else {
2057 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2058 nir_print_instr(&instr->instr, stderr);
2059 fprintf(stderr, "\n");
2060 }
2061 break;
2062 }
2063 case nir_op_fsqrt: {
2064 Temp src = get_alu_src(ctx, instr->src[0]);
2065 if (dst.regClass() == v2b) {
2066 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2067 } else if (dst.regClass() == v1) {
2068 emit_sqrt(ctx, bld, Definition(dst), src);
2069 } else if (dst.regClass() == v2) {
2070 /* Lowered at NIR level for precision reasons. */
2071 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2072 } else {
2073 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2074 nir_print_instr(&instr->instr, stderr);
2075 fprintf(stderr, "\n");
2076 }
2077 break;
2078 }
2079 case nir_op_ffract: {
2080 if (dst.regClass() == v2b) {
2081 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2082 } else if (dst.regClass() == v1) {
2083 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2084 } else if (dst.regClass() == v2) {
2085 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2086 } else {
2087 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2088 nir_print_instr(&instr->instr, stderr);
2089 fprintf(stderr, "\n");
2090 }
2091 break;
2092 }
2093 case nir_op_ffloor: {
2094 Temp src = get_alu_src(ctx, instr->src[0]);
2095 if (dst.regClass() == v2b) {
2096 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2097 } else if (dst.regClass() == v1) {
2098 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2099 } else if (dst.regClass() == v2) {
2100 emit_floor_f64(ctx, bld, Definition(dst), src);
2101 } else {
2102 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2103 nir_print_instr(&instr->instr, stderr);
2104 fprintf(stderr, "\n");
2105 }
2106 break;
2107 }
2108 case nir_op_fceil: {
2109 Temp src0 = get_alu_src(ctx, instr->src[0]);
2110 if (dst.regClass() == v2b) {
2111 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2112 } else if (dst.regClass() == v1) {
2113 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2114 } else if (dst.regClass() == v2) {
2115 if (ctx->options->chip_class >= GFX7) {
2116 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2117 } else {
2118 /* GFX6 doesn't support V_CEIL_F64, lower it. */
2119 /* trunc = trunc(src0)
2120 * if (src0 > 0.0 && src0 != trunc)
2121 * trunc += 1.0
2122 */
2123 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2124 Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u));
2125 Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
2126 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1);
2127 Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond);
2128 add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add);
2129 bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
2130 }
2131 } else {
2132 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2133 nir_print_instr(&instr->instr, stderr);
2134 fprintf(stderr, "\n");
2135 }
2136 break;
2137 }
2138 case nir_op_ftrunc: {
2139 Temp src = get_alu_src(ctx, instr->src[0]);
2140 if (dst.regClass() == v2b) {
2141 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2142 } else if (dst.regClass() == v1) {
2143 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2144 } else if (dst.regClass() == v2) {
2145 emit_trunc_f64(ctx, bld, Definition(dst), src);
2146 } else {
2147 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2148 nir_print_instr(&instr->instr, stderr);
2149 fprintf(stderr, "\n");
2150 }
2151 break;
2152 }
2153 case nir_op_fround_even: {
2154 Temp src0 = get_alu_src(ctx, instr->src[0]);
2155 if (dst.regClass() == v2b) {
2156 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
2157 } else if (dst.regClass() == v1) {
2158 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
2159 } else if (dst.regClass() == v2) {
2160 if (ctx->options->chip_class >= GFX7) {
2161 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
2162 } else {
2163 /* GFX6 doesn't support V_RNDNE_F64, lower it. */
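/* Classic magic-number rounding: add and subtract 2^52 (0x43300000...,
 * with the sign of the source copied in via v_bfi) so the FPU rounds to
 * the nearest integer:
 *    rounded = (src + copysign(2^52, src)) - copysign(2^52, src)
 * Values with |src| >= 2^52 are already integral and are passed through. */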
2164 Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
2165 bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
2166
2167 Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u)));
2168 Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi));
2169 Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
2170 Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
2171 static_cast<VOP3A_instruction*>(sub)->neg[1] = true;
2172 tmp = sub->definitions[0].getTemp();
2173
2174 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu));
2175 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
2176 static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
2177 Temp cond = vop3->definitions[0].getTemp();
2178
2179 Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2180 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2181 Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond);
2182 Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond);
2183
2184 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2185 }
2186 } else {
2187 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2188 nir_print_instr(&instr->instr, stderr);
2189 fprintf(stderr, "\n");
2190 }
2191 break;
2192 }
2193 case nir_op_fsin:
2194 case nir_op_fcos: {
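/* v_sin/v_cos take their input in units of full turns, so the source is
 * scaled by 1/(2*pi) first (the half_pi temporaries actually hold
 * 0.15915494: 0x3e22f983 as f32, 0x3118 as f16). */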
2195 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2196 aco_ptr<Instruction> norm;
2197 if (dst.regClass() == v2b) {
2198 Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u));
2199 Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
2200 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2201 bld.vop1(opcode, Definition(dst), tmp);
2202 } else if (dst.regClass() == v1) {
2203 Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
2204 Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
2205
2206 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2207 if (ctx->options->chip_class < GFX9)
2208 tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
2209
2210 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2211 bld.vop1(opcode, Definition(dst), tmp);
2212 } else {
2213 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2214 nir_print_instr(&instr->instr, stderr);
2215 fprintf(stderr, "\n");
2216 }
2217 break;
2218 }
2219 case nir_op_ldexp: {
2220 Temp src0 = get_alu_src(ctx, instr->src[0]);
2221 Temp src1 = get_alu_src(ctx, instr->src[1]);
2222 if (dst.regClass() == v2b) {
2223 emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2224 } else if (dst.regClass() == v1) {
2225 bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1);
2226 } else if (dst.regClass() == v2) {
2227 bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), as_vgpr(ctx, src0), src1);
2228 } else {
2229 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2230 nir_print_instr(&instr->instr, stderr);
2231 fprintf(stderr, "\n");
2232 }
2233 break;
2234 }
2235 case nir_op_frexp_sig: {
2236 Temp src = get_alu_src(ctx, instr->src[0]);
2237 if (dst.regClass() == v2b) {
2238 bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src);
2239 } else if (dst.regClass() == v1) {
2240 bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src);
2241 } else if (dst.regClass() == v2) {
2242 bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src);
2243 } else {
2244 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2245 nir_print_instr(&instr->instr, stderr);
2246 fprintf(stderr, "\n");
2247 }
2248 break;
2249 }
2250 case nir_op_frexp_exp: {
2251 Temp src = get_alu_src(ctx, instr->src[0]);
2252 if (instr->src[0].src.ssa->bit_size == 16) {
2253 Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2254 tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand(0u));
2255 convert_int(ctx, bld, tmp, 8, 32, true, dst);
2256 } else if (instr->src[0].src.ssa->bit_size == 32) {
2257 bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src);
2258 } else if (instr->src[0].src.ssa->bit_size == 64) {
2259 bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src);
2260 } else {
2261 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2262 nir_print_instr(&instr->instr, stderr);
2263 fprintf(stderr, "\n");
2264 }
2265 break;
2266 }
2267 case nir_op_fsign: {
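/* fsign via two selects: replace the value with 1.0 where it is > 0 and
 * with -1.0 where it is < 0; zero falls through unchanged. For f64 only
 * the high dword needs selecting, since the low dword of +-1.0 is zero. */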
2268 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2269 if (dst.regClass() == v2b) {
2270 Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
2271 Temp minus_one = bld.copy(bld.def(v1), Operand(0xbc00u));
2272 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2273 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond);
2274 cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2275 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, cond);
2276 } else if (dst.regClass() == v1) {
2277 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2278 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
2279 cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2280 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
2281 } else if (dst.regClass() == v2) {
2282 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2283 Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
2284 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
2285
2286 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2287 tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
2288 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2289
2290 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2291 } else {
2292 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2293 nir_print_instr(&instr->instr, stderr);
2294 fprintf(stderr, "\n");
2295 }
2296 break;
2297 }
2298 case nir_op_f2f16:
2299 case nir_op_f2f16_rtne: {
2300 Temp src = get_alu_src(ctx, instr->src[0]);
2301 if (instr->src[0].src.ssa->bit_size == 64)
2302 src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2303 if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2304 /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2305 * keep value numbering and the scheduler simpler.
2306 */
2307 bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
2308 else
2309 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2310 break;
2311 }
2312 case nir_op_f2f16_rtz: {
2313 Temp src = get_alu_src(ctx, instr->src[0]);
2314 if (instr->src[0].src.ssa->bit_size == 64)
2315 src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2316 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u));
2317 break;
2318 }
2319 case nir_op_f2f32: {
2320 if (instr->src[0].src.ssa->bit_size == 16) {
2321 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2322 } else if (instr->src[0].src.ssa->bit_size == 64) {
2323 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2324 } else {
2325 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2326 nir_print_instr(&instr->instr, stderr);
2327 fprintf(stderr, "\n");
2328 }
2329 break;
2330 }
2331 case nir_op_f2f64: {
2332 Temp src = get_alu_src(ctx, instr->src[0]);
2333 if (instr->src[0].src.ssa->bit_size == 16)
2334 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2335 bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2336 break;
2337 }
2338 case nir_op_i2f16: {
2339 assert(dst.regClass() == v2b);
2340 Temp src = get_alu_src(ctx, instr->src[0]);
2341 if (instr->src[0].src.ssa->bit_size == 8)
2342 src = convert_int(ctx, bld, src, 8, 16, true);
2343 else if (instr->src[0].src.ssa->bit_size == 64)
2344 src = convert_int(ctx, bld, src, 64, 32, false);
2345 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2346 break;
2347 }
2348 case nir_op_i2f32: {
2349 assert(dst.size() == 1);
2350 Temp src = get_alu_src(ctx, instr->src[0]);
2351 if (instr->src[0].src.ssa->bit_size <= 16)
2352 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2353 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2354 break;
2355 }
2356 case nir_op_i2f64: {
2357 if (instr->src[0].src.ssa->bit_size <= 32) {
2358 Temp src = get_alu_src(ctx, instr->src[0]);
2359 if (instr->src[0].src.ssa->bit_size <= 16)
2360 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2361 bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
2362 } else if (instr->src[0].src.ssa->bit_size == 64) {
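/* 64-bit int to double: convert the halves separately (the low half as
 * unsigned, the high half as signed), scale the high half by 2^32 with
 * v_ldexp_f64 and add the two parts. */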
2363 Temp src = get_alu_src(ctx, instr->src[0]);
2364 RegClass rc = RegClass(src.type(), 1);
2365 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2366 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2367 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2368 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2369 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
2370 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2371
2372 } else {
2373 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2374 nir_print_instr(&instr->instr, stderr);
2375 fprintf(stderr, "\n");
2376 }
2377 break;
2378 }
2379 case nir_op_u2f16: {
2380 assert(dst.regClass() == v2b);
2381 Temp src = get_alu_src(ctx, instr->src[0]);
2382 if (instr->src[0].src.ssa->bit_size == 8)
2383 src = convert_int(ctx, bld, src, 8, 16, false);
2384 else if (instr->src[0].src.ssa->bit_size == 64)
2385 src = convert_int(ctx, bld, src, 64, 32, false);
2386 bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
2387 break;
2388 }
2389 case nir_op_u2f32: {
2390 assert(dst.size() == 1);
2391 Temp src = get_alu_src(ctx, instr->src[0]);
2392 if (instr->src[0].src.ssa->bit_size == 8) {
2393 bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
2394 } else {
2395 if (instr->src[0].src.ssa->bit_size == 16)
2396 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2397 bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
2398 }
2399 break;
2400 }
2401 case nir_op_u2f64: {
2402 if (instr->src[0].src.ssa->bit_size <= 32) {
2403 Temp src = get_alu_src(ctx, instr->src[0]);
2404 if (instr->src[0].src.ssa->bit_size <= 16)
2405 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2406 bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
2407 } else if (instr->src[0].src.ssa->bit_size == 64) {
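/* Same decomposition as i2f64 above, with both halves converted as unsigned. */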
2408 Temp src = get_alu_src(ctx, instr->src[0]);
2409 RegClass rc = RegClass(src.type(), 1);
2410 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2411 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2412 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2413 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2414 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
2415 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2416 } else {
2417 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2418 nir_print_instr(&instr->instr, stderr);
2419 fprintf(stderr, "\n");
2420 }
2421 break;
2422 }
2423 case nir_op_f2i8:
2424 case nir_op_f2i16: {
2425 if (instr->src[0].src.ssa->bit_size == 16)
2426 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
2427 else if (instr->src[0].src.ssa->bit_size == 32)
2428 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2429 else
2430 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2431 break;
2432 }
2433 case nir_op_f2u8:
2434 case nir_op_f2u16: {
2435 if (instr->src[0].src.ssa->bit_size == 16)
2436 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
2437 else if (instr->src[0].src.ssa->bit_size == 32)
2438 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2439 else
2440 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2441 break;
2442 }
2443 case nir_op_f2i32: {
2444 Temp src = get_alu_src(ctx, instr->src[0]);
2445 if (instr->src[0].src.ssa->bit_size == 16) {
2446 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2447 if (dst.type() == RegType::vgpr) {
2448 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
2449 } else {
2450 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2451 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
2452 }
2453 } else if (instr->src[0].src.ssa->bit_size == 32) {
2454 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2455 } else if (instr->src[0].src.ssa->bit_size == 64) {
2456 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2457 } else {
2458 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2459 nir_print_instr(&instr->instr, stderr);
2460 fprintf(stderr, "\n");
2461 }
2462 break;
2463 }
2464 case nir_op_f2u32: {
2465 Temp src = get_alu_src(ctx, instr->src[0]);
2466 if (instr->src[0].src.ssa->bit_size == 16) {
2467 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2468 if (dst.type() == RegType::vgpr) {
2469 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
2470 } else {
2471 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2472 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
2473 }
2474 } else if (instr->src[0].src.ssa->bit_size == 32) {
2475 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2476 } else if (instr->src[0].src.ssa->bit_size == 64) {
2477 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2478 } else {
2479 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2480 nir_print_instr(&instr->instr, stderr);
2481 fprintf(stderr, "\n");
2482 }
2483 break;
2484 }
2485 case nir_op_f2i64: {
2486 Temp src = get_alu_src(ctx, instr->src[0]);
2487 if (instr->src[0].src.ssa->bit_size == 16)
2488 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2489
2490 if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
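/* Manual f32 -> i64: clamp the exponent to [0, 64], rebuild the mantissa
 * with the implicit leading one, shift it into a 64-bit value, saturate
 * when the exponent is out of range, then apply the sign by xor-ing with
 * and subtracting the sign mask. The SGPR path below mirrors this on the
 * SALU. */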
2491 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2492 exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
2493 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
2494 Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
2495 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
2496 mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
2497 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
2498 Temp new_exponent = bld.tmp(v1);
2499 Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
2500 if (ctx->program->chip_class >= GFX8)
2501 mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
2502 else
2503 mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
2504 Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
2505 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2506 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2507 lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
2508 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
2509 lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
2510 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
2511 Temp new_lower = bld.tmp(v1);
2512 borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
2513 Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
2514 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
2515
2516 } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2517 if (src.type() == RegType::vgpr)
2518 src = bld.as_uniform(src);
2519 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
2520 exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
2521 exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
2522 exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
2523 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
2524 Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
2525 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
2526 mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
2527 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
2528 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
2529 mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
2530 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
2531 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
2532 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
2533 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2534 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2535 lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
2536 upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
2537 Temp borrow = bld.tmp(s1);
2538 lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
2539 upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
2540 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2541
2542 } else if (instr->src[0].src.ssa->bit_size == 64) {
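/* f64 -> i64 without a 64-bit convert:
 *    hi = floor(trunc(src) * 2^-32)      (0x3df00000... = 2^-32)
 *    lo = fma(hi, -2^32, trunc(src))     (0xc1f00000... = -2^32)
 * f2u64 below uses the same split with an unsigned convert for the high half. */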
2543 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
2544 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2545 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2546 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
2547 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2548 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2549 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2550 Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
2551 if (dst.type() == RegType::sgpr) {
2552 lower = bld.as_uniform(lower);
2553 upper = bld.as_uniform(upper);
2554 }
2555 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2556
2557 } else {
2558 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2559 nir_print_instr(&instr->instr, stderr);
2560 fprintf(stderr, "\n");
2561 }
2562 break;
2563 }
2564 case nir_op_f2u64: {
2565 Temp src = get_alu_src(ctx, instr->src[0]);
2566 if (instr->src[0].src.ssa->bit_size == 16)
2567 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2568
2569 if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
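/* Unsigned variant of the manual conversion: exponents below 24 shift the
 * 24-bit mantissa right, larger ones shift the 64-bit mantissa left, and
 * out-of-range exponents saturate the result to ~0u. */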
2570 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2571 Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent);
2572 exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
2573 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
2574 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
2575 Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
2576 Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
2577 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
2578 Temp new_exponent = bld.tmp(v1);
2579 Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
2580 if (ctx->program->chip_class >= GFX8)
2581 mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
2582 else
2583 mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
2584 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2585 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2586 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
2587 upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
2588 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
2589 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
2590 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2591
2592 } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2593 if (src.type() == RegType::vgpr)
2594 src = bld.as_uniform(src);
2595 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
2596 exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
2597 exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
2598 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
2599 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
2600 Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
2601 Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
2602 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
2603 Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
2604 mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
2605 Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
2606 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
2607 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2608 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2609 Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
2610 lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
2611 upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
2612 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2613
2614 } else if (instr->src[0].src.ssa->bit_size == 64) {
2615 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
2616 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2617 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2618 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
2619 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2620 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2621 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2622 Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
2623 if (dst.type() == RegType::sgpr) {
2624 lower = bld.as_uniform(lower);
2625 upper = bld.as_uniform(upper);
2626 }
2627 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2628
2629 } else {
2630 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2631 nir_print_instr(&instr->instr, stderr);
2632 fprintf(stderr, "\n");
2633 }
2634 break;
2635 }
2636 case nir_op_b2f16: {
2637 Temp src = get_alu_src(ctx, instr->src[0]);
2638 assert(src.regClass() == bld.lm);
2639
2640 if (dst.regClass() == s1) {
2641 src = bool_to_scalar_condition(ctx, src);
2642 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3c00u), src);
2643 } else if (dst.regClass() == v2b) {
2644 Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
2645 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), one, src);
2646 } else {
2647 unreachable("Wrong destination register class for nir_op_b2f16.");
2648 }
2649 break;
2650 }
2651 case nir_op_b2f32: {
2652 Temp src = get_alu_src(ctx, instr->src[0]);
2653 assert(src.regClass() == bld.lm);
2654
2655 if (dst.regClass() == s1) {
2656 src = bool_to_scalar_condition(ctx, src);
2657 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
2658 } else if (dst.regClass() == v1) {
2659 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
2660 } else {
2661 unreachable("Wrong destination register class for nir_op_b2f32.");
2662 }
2663 break;
2664 }
2665 case nir_op_b2f64: {
2666 Temp src = get_alu_src(ctx, instr->src[0]);
2667 assert(src.regClass() == bld.lm);
2668
2669 if (dst.regClass() == s2) {
2670 src = bool_to_scalar_condition(ctx, src);
2671 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
2672 } else if (dst.regClass() == v2) {
2673 Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
2674 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
2675 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2676 } else {
2677 unreachable("Wrong destination register class for nir_op_b2f64.");
2678 }
2679 break;
2680 }
2681 case nir_op_i2i8:
2682 case nir_op_i2i16:
2683 case nir_op_i2i32:
2684 case nir_op_i2i64: {
2685 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
2686 instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, true, dst);
2687 break;
2688 }
2689 case nir_op_u2u8:
2690 case nir_op_u2u16:
2691 case nir_op_u2u32:
2692 case nir_op_u2u64: {
2693 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
2694 instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst);
2695 break;
2696 }
2697 case nir_op_b2b32:
2698 case nir_op_b2i8:
2699 case nir_op_b2i16:
2700 case nir_op_b2i32:
2701 case nir_op_b2i64: {
2702 Temp src = get_alu_src(ctx, instr->src[0]);
2703 assert(src.regClass() == bld.lm);
2704
2705 Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;
2706 if (tmp.regClass() == s1) {
2707 // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
2708 bool_to_scalar_condition(ctx, src, tmp);
2709 } else if (tmp.type() == RegType::vgpr) {
2710 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand(0u), Operand(1u), src);
2711 } else {
2712 unreachable("Invalid register class for b2i32");
2713 }
2714
2715 if (tmp != dst)
2716 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
2717 break;
2718 }
2719 case nir_op_b2b1:
2720 case nir_op_i2b1: {
2721 Temp src = get_alu_src(ctx, instr->src[0]);
2722 assert(dst.regClass() == bld.lm);
2723
2724 if (src.type() == RegType::vgpr) {
2725 assert(src.regClass() == v1 || src.regClass() == v2);
2726 assert(dst.regClass() == bld.lm);
2727 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
2728 Definition(dst), Operand(0u), src).def(0).setHint(vcc);
2729 } else {
2730 assert(src.regClass() == s1 || src.regClass() == s2);
2731 Temp tmp;
2732 if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
2733 tmp = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand(0u), src).def(1).getTemp();
2734 } else {
2735 tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
2736 bld.scc(bld.def(s1)), Operand(0u), src);
2737 }
2738 bool_to_vector_condition(ctx, tmp, dst);
2739 }
2740 break;
2741 }
2742 case nir_op_pack_64_2x32_split: {
2743 Temp src0 = get_alu_src(ctx, instr->src[0]);
2744 Temp src1 = get_alu_src(ctx, instr->src[1]);
2745
2746 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2747 break;
2748 }
2749 case nir_op_unpack_64_2x32_split_x:
2750 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2751 break;
2752 case nir_op_unpack_64_2x32_split_y:
2753 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2754 break;
2755 case nir_op_unpack_32_2x16_split_x:
2756 if (dst.type() == RegType::vgpr) {
2757 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2758 } else {
2759 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
2760 }
2761 break;
2762 case nir_op_unpack_32_2x16_split_y:
2763 if (dst.type() == RegType::vgpr) {
2764 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2765 } else {
2766 bld.sop2(aco_opcode::s_bfe_u32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]), Operand(uint32_t(16 << 16 | 16)));
2767 }
2768 break;
2769 case nir_op_pack_32_2x16_split: {
2770 Temp src0 = get_alu_src(ctx, instr->src[0]);
2771 Temp src1 = get_alu_src(ctx, instr->src[1]);
2772 if (dst.regClass() == v1) {
2773 src0 = emit_extract_vector(ctx, src0, 0, v2b);
2774 src1 = emit_extract_vector(ctx, src1, 0, v2b);
2775 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2776 } else {
2777 src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, Operand(0xFFFFu));
2778 src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1, Operand(16u));
2779 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
2780 }
2781 break;
2782 }
2783 case nir_op_pack_half_2x16: {
2784 Temp src = get_alu_src(ctx, instr->src[0], 2);
2785
2786 if (dst.regClass() == v1) {
2787 Temp src0 = bld.tmp(v1);
2788 Temp src1 = bld.tmp(v1);
2789 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2790 if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz)
2791 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2792 else
2793 bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst),
2794 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src0),
2795 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src1));
2796 } else {
2797 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2798 nir_print_instr(&instr->instr, stderr);
2799 fprintf(stderr, "\n");
2800 }
2801 break;
2802 }
2803 case nir_op_unpack_half_2x16_split_x: {
2804 if (dst.regClass() == v1) {
2805 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2806 } else {
2807 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2808 nir_print_instr(&instr->instr, stderr);
2809 fprintf(stderr, "\n");
2810 }
2811 break;
2812 }
2813 case nir_op_unpack_half_2x16_split_y: {
2814 if (dst.regClass() == v1) {
2815 /* TODO: use SDWA here */
2816 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2817 bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2818 } else {
2819 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2820 nir_print_instr(&instr->instr, stderr);
2821 fprintf(stderr, "\n");
2822 }
2823 break;
2824 }
2825 case nir_op_fquantize2f16: {
2826 Temp src = get_alu_src(ctx, instr->src[0]);
2827 Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
2828 Temp f32, cmp_res;
2829
2830 if (ctx->program->chip_class >= GFX8) {
2831 Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* matches every class except positive/negative denormals */
2832 cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);
2833 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2834 } else {
2835 /* 0x38800000 is the smallest normalized half-float value (2^-14) represented as a 32-bit float,
2836 * so compare the result against it and flush to 0 if it's smaller.
2837 */
2838 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2839 Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u));
2840 Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), f32, smallest);
2841 static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
2842 cmp_res = vop3->definitions[0].getTemp();
2843 }
2844
2845 if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32 || ctx->program->chip_class < GFX8) {
2846 Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src));
2847 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
2848 } else {
2849 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2850 }
2851 break;
2852 }
2853 case nir_op_bfm: {
2854 Temp bits = get_alu_src(ctx, instr->src[0]);
2855 Temp offset = get_alu_src(ctx, instr->src[1]);
2856
2857 if (dst.regClass() == s1) {
2858 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2859 } else if (dst.regClass() == v1) {
2860 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2861 } else {
2862 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2863 nir_print_instr(&instr->instr, stderr);
2864 fprintf(stderr, "\n");
2865 }
2866 break;
2867 }
2868 case nir_op_bitfield_select: {
2869 /* (mask & insert) | (~mask & base) */
2870 Temp bitmask = get_alu_src(ctx, instr->src[0]);
2871 Temp insert = get_alu_src(ctx, instr->src[1]);
2872 Temp base = get_alu_src(ctx, instr->src[2]);
2873
2874 /* dst = (insert & bitmask) | (base & ~bitmask) */
2875 if (dst.regClass() == s1) {
2876 aco_ptr<Instruction> sop2;
2877 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2878 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2879 Operand lhs;
2880 if (const_insert && const_bitmask) {
2881 lhs = Operand(const_insert->u32 & const_bitmask->u32);
2882 } else {
2883 insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2884 lhs = Operand(insert);
2885 }
2886
2887 Operand rhs;
2888 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2889 if (const_base && const_bitmask) {
2890 rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2891 } else {
2892 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2893 rhs = Operand(base);
2894 }
2895
2896 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2897
2898 } else if (dst.regClass() == v1) {
2899 if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2900 base = as_vgpr(ctx, base);
2901 if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2902 insert = as_vgpr(ctx, insert);
2903
2904 bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2905
2906 } else {
2907 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2908 nir_print_instr(&instr->instr, stderr);
2909 fprintf(stderr, "\n");
2910 }
2911 break;
2912 }
2913 case nir_op_ubfe:
2914 case nir_op_ibfe: {
2915 Temp base = get_alu_src(ctx, instr->src[0]);
2916 Temp offset = get_alu_src(ctx, instr->src[1]);
2917 Temp bits = get_alu_src(ctx, instr->src[2]);
2918
2919 if (dst.type() == RegType::sgpr) {
2920 Operand extract;
2921 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2922 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2923 if (const_offset && const_bits) {
2924 uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2925 extract = Operand(const_extract);
2926 } else {
2927 Operand width;
2928 if (const_bits) {
2929 width = Operand(const_bits->u32 << 16);
2930 } else {
2931 width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2932 }
2933 extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2934 }
2935
2936 aco_opcode opcode;
2937 if (dst.regClass() == s1) {
2938 if (instr->op == nir_op_ubfe)
2939 opcode = aco_opcode::s_bfe_u32;
2940 else
2941 opcode = aco_opcode::s_bfe_i32;
2942 } else if (dst.regClass() == s2) {
2943 if (instr->op == nir_op_ubfe)
2944 opcode = aco_opcode::s_bfe_u64;
2945 else
2946 opcode = aco_opcode::s_bfe_i64;
2947 } else {
2948 unreachable("Unsupported BFE bit size");
2949 }
2950
2951 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2952
2953 } else {
2954 aco_opcode opcode;
2955 if (dst.regClass() == v1) {
2956 if (instr->op == nir_op_ubfe)
2957 opcode = aco_opcode::v_bfe_u32;
2958 else
2959 opcode = aco_opcode::v_bfe_i32;
2960 } else {
2961 unreachable("Unsupported BFE bit size");
2962 }
2963
2964 emit_vop3a_instruction(ctx, instr, opcode, dst);
2965 }
2966 break;
2967 }
2968 case nir_op_bit_count: {
2969 Temp src = get_alu_src(ctx, instr->src[0]);
2970 if (src.regClass() == s1) {
2971 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2972 } else if (src.regClass() == v1) {
2973 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2974 } else if (src.regClass() == v2) {
2975 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2976 emit_extract_vector(ctx, src, 1, v1),
2977 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2978 emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2979 } else if (src.regClass() == s2) {
2980 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2981 } else {
2982 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2983 nir_print_instr(&instr->instr, stderr);
2984 fprintf(stderr, "\n");
2985 }
2986 break;
2987 }
2988 case nir_op_flt: {
2989 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64);
2990 break;
2991 }
2992 case nir_op_fge: {
2993 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64);
2994 break;
2995 }
2996 case nir_op_feq: {
2997 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64);
2998 break;
2999 }
3000 case nir_op_fne: {
3001 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64);
3002 break;
3003 }
3004 case nir_op_ilt: {
3005 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
3006 break;
3007 }
3008 case nir_op_ige: {
3009 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
3010 break;
3011 }
3012 case nir_op_ieq: {
3013 if (instr->src[0].src.ssa->bit_size == 1)
3014 emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3015 else
3016 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
3017 ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3018 break;
3019 }
3020 case nir_op_ine: {
3021 if (instr->src[0].src.ssa->bit_size == 1)
3022 emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3023 else
3024 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
3025 ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3026 break;
3027 }
3028 case nir_op_ult: {
3029 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
3030 break;
3031 }
3032 case nir_op_uge: {
3033 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
3034 break;
3035 }
3036 case nir_op_fddx:
3037 case nir_op_fddy:
3038 case nir_op_fddx_fine:
3039 case nir_op_fddy_fine:
3040 case nir_op_fddx_coarse:
3041 case nir_op_fddy_coarse: {
3042 Temp src = get_alu_src(ctx, instr->src[0]);
3043 uint16_t dpp_ctrl1, dpp_ctrl2;
3044 if (instr->op == nir_op_fddx_fine) {
3045 dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
3046 dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
3047 } else if (instr->op == nir_op_fddy_fine) {
3048 dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
3049 dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
3050 } else {
3051 dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
3052 if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
3053 dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
3054 else
3055 dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
3056 }
3057
3058 Temp tmp;
3059 if (ctx->program->chip_class >= GFX8) {
3060 Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3061 tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
3062 } else {
3063 Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
3064 Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
3065 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
3066 }
3067 emit_wqm(ctx, tmp, dst, true);
3068 break;
3069 }
3070 default:
3071 fprintf(stderr, "Unknown NIR ALU instr: ");
3072 nir_print_instr(&instr->instr, stderr);
3073 fprintf(stderr, "\n");
3074 }
3075 }
3076
3077 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
3078 {
3079 Temp dst = get_ssa_temp(ctx, &instr->def);
3080
3081 // TODO: we really want to have the resulting type here, as that would allow 64-bit literals
3082 // (currently the LSBs get truncated for doubles and the MSBs for ints);
3083 // for now, we only use s_mov_b64 with 64-bit inline constants
3084 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
3085 assert(dst.type() == RegType::sgpr);
3086
3087 Builder bld(ctx->program, ctx->block);
3088
3089 if (instr->def.bit_size == 1) {
3090 assert(dst.regClass() == bld.lm);
3091 int val = instr->value[0].b ? -1 : 0;
3092 Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val);
3093 bld.sop1(Builder::s_mov, Definition(dst), op);
3094 } else if (instr->def.bit_size == 8) {
3095 /* ensure that the value is correctly represented in the low byte of the register */
3096 bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u8);
3097 } else if (instr->def.bit_size == 16) {
3098 /* ensure that the value is correctly represented in the low half of the register */
3099 bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u16);
3100 } else if (dst.size() == 1) {
3101 bld.copy(Definition(dst), Operand(instr->value[0].u32));
3102 } else {
3103 assert(dst.size() != 1);
3104 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3105 if (instr->def.bit_size == 64)
3106 for (unsigned i = 0; i < dst.size(); i++)
3107 vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
3108 else {
3109 for (unsigned i = 0; i < dst.size(); i++)
3110 vec->operands[i] = Operand{instr->value[i].u32};
3111 }
3112 vec->definitions[0] = Definition(dst);
3113 ctx->block->instructions.emplace_back(std::move(vec));
3114 }
3115 }
3116
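/* Informal note: widen_mask() expands a per-element mask into a mask with `multiplier`
 * bits per element: each set bit i becomes `multiplier` consecutive set bits starting
 * at i * multiplier, e.g. widen_mask(0b101, 2) == 0b110011. The store helpers below use
 * it to turn a component write mask into a byte write mask by passing elem_size_bytes
 * as the multiplier. */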
3117 uint32_t widen_mask(uint32_t mask, unsigned multiplier)
3118 {
3119 uint32_t new_mask = 0;
3120 for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
3121 if (mask & (1u << i))
3122 new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
3123 return new_mask;
3124 }
3125
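/* Rough summary: LoadEmitInfo bundles the parameters shared by the generic load path
 * below: the dynamic offset operand plus const_offset, the destination temp and its
 * num_components/component_size shape, an optional buffer resource and soffset, and
 * flags such as glc, the barrier kind, reorderability and the swizzle component size. */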
3126 struct LoadEmitInfo {
3127 Operand offset;
3128 Temp dst;
3129 unsigned num_components;
3130 unsigned component_size;
3131 Temp resource = Temp(0, s1);
3132 unsigned component_stride = 0;
3133 unsigned const_offset = 0;
3134 unsigned align_mul = 0;
3135 unsigned align_offset = 0;
3136
3137 bool glc = false;
3138 unsigned swizzle_component_size = 0;
3139 barrier_interaction barrier = barrier_none;
3140 bool can_reorder = true;
3141 Temp soffset = Temp(0, s1);
3142 };
3143
3144 using LoadCallback = Temp(*)(
3145 Builder& bld, const LoadEmitInfo* info, Temp offset, unsigned bytes_needed,
3146 unsigned align, unsigned const_offset, Temp dst_hint);
3147
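/* Informal overview: emit_load() splits a possibly unaligned or oversized load into
 * chunks that the given LoadCallback can emit. It reduces constant offsets that exceed
 * max_const_offset_plus_one, aligns the offset down and shifts the result when
 * byte_align_loads is set and the callback cannot do sub-dword accesses, and finally
 * recombines the loaded pieces into info->dst (going through a p_as_uniform when an
 * SGPR result was loaded via VGPRs). It is instantiated further below, e.g.:
 *   static auto emit_lds_load = emit_load<lds_load_callback, false, true, UINT32_MAX>;
 */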
3148 template <LoadCallback callback, bool byte_align_loads, bool supports_8bit_16bit_loads, unsigned max_const_offset_plus_one>
3149 void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
3150 {
3151 unsigned load_size = info->num_components * info->component_size;
3152 unsigned component_size = info->component_size;
3153
3154 unsigned num_vals = 0;
3155 Temp vals[info->dst.bytes()];
3156
3157 unsigned const_offset = info->const_offset;
3158
3159 unsigned align_mul = info->align_mul ? info->align_mul : component_size;
3160 unsigned align_offset = (info->align_offset + const_offset) % align_mul;
3161
3162 unsigned bytes_read = 0;
3163 while (bytes_read < load_size) {
3164 unsigned bytes_needed = load_size - bytes_read;
3165
3166 /* add extra bytes so unaligned data can be fetched with a larger aligned load and shifted into place afterwards */
3167 int byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
3168
3169 if (byte_align) {
3170 if ((bytes_needed > 2 ||
3171 (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
3172 !supports_8bit_16bit_loads) && byte_align_loads) {
3173 if (info->component_stride) {
3174 assert(supports_8bit_16bit_loads && "unimplemented");
3175 bytes_needed = 2;
3176 byte_align = 0;
3177 } else {
3178 bytes_needed += byte_align == -1 ? 4 - info->align_mul : byte_align;
3179 bytes_needed = align(bytes_needed, 4);
3180 }
3181 } else {
3182 byte_align = 0;
3183 }
3184 }
3185
3186 if (info->swizzle_component_size)
3187 bytes_needed = MIN2(bytes_needed, info->swizzle_component_size);
3188 if (info->component_stride)
3189 bytes_needed = MIN2(bytes_needed, info->component_size);
3190
3191 bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
3192
3193 /* reduce constant offset */
3194 Operand offset = info->offset;
3195 unsigned reduced_const_offset = const_offset;
3196 bool remove_const_offset_completely = need_to_align_offset;
3197 if (const_offset && (remove_const_offset_completely || const_offset >= max_const_offset_plus_one)) {
3198 unsigned to_add = const_offset;
3199 if (remove_const_offset_completely) {
3200 reduced_const_offset = 0;
3201 } else {
3202 to_add = const_offset / max_const_offset_plus_one * max_const_offset_plus_one;
3203 reduced_const_offset %= max_const_offset_plus_one;
3204 }
3205 Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3206 if (offset.isConstant()) {
3207 offset = Operand(offset.constantValue() + to_add);
3208 } else if (offset_tmp.regClass() == s1) {
3209 offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
3210 offset_tmp, Operand(to_add));
3211 } else if (offset_tmp.regClass() == v1) {
3212 offset = bld.vadd32(bld.def(v1), offset_tmp, Operand(to_add));
3213 } else {
3214 Temp lo = bld.tmp(offset_tmp.type(), 1);
3215 Temp hi = bld.tmp(offset_tmp.type(), 1);
3216 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3217
3218 if (offset_tmp.regClass() == s2) {
3219 Temp carry = bld.tmp(s1);
3220 lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo, Operand(to_add));
3221 hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
3222 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
3223 } else {
3224 Temp new_lo = bld.tmp(v1);
3225 Temp carry = bld.vadd32(Definition(new_lo), lo, Operand(to_add), true).def(1).getTemp();
3226 hi = bld.vadd32(bld.def(v1), hi, Operand(0u), false, carry);
3227 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
3228 }
3229 }
3230 }
3231
3232 /* align offset down if needed */
3233 Operand aligned_offset = offset;
3234 if (need_to_align_offset) {
3235 Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3236 if (offset.isConstant()) {
3237 aligned_offset = Operand(offset.constantValue() & 0xfffffffcu);
3238 } else if (offset_tmp.regClass() == s1) {
3239 aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfffffffcu), offset_tmp);
3240 } else if (offset_tmp.regClass() == s2) {
3241 aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand((uint64_t)0xfffffffffffffffcllu), offset_tmp);
3242 } else if (offset_tmp.regClass() == v1) {
3243 aligned_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), offset_tmp);
3244 } else if (offset_tmp.regClass() == v2) {
3245 Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
3246 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3247 lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), lo);
3248 aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
3249 }
3250 }
3251 Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() :
3252 bld.copy(bld.def(s1), aligned_offset);
3253
3254 unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
3255 Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align,
3256 reduced_const_offset, byte_align ? Temp() : info->dst);
3257
3258 /* the callback wrote directly to dst */
3259 if (val == info->dst) {
3260 assert(num_vals == 0);
3261 emit_split_vector(ctx, info->dst, info->num_components);
3262 return;
3263 }
3264
3265 /* shift result right if needed */
3266 if (info->component_size < 4 && byte_align_loads) {
3267 Operand align((uint32_t)byte_align);
3268 if (byte_align == -1) {
3269 if (offset.isConstant())
3270 align = Operand(offset.constantValue() % 4u);
3271 else if (offset.size() == 2)
3272 align = Operand(emit_extract_vector(ctx, offset.getTemp(), 0, RegClass(offset.getTemp().type(), 1)));
3273 else
3274 align = offset;
3275 }
3276
3277 assert(val.bytes() >= load_size && "unimplemented");
3278 if (val.type() == RegType::sgpr)
3279 byte_align_scalar(ctx, val, align, info->dst);
3280 else
3281 byte_align_vector(ctx, val, align, info->dst, component_size);
3282 return;
3283 }
3284
3285 /* add result to list and advance */
3286 if (info->component_stride) {
3287 assert(val.bytes() == info->component_size && "unimplemented");
3288 const_offset += info->component_stride;
3289 align_offset = (align_offset + info->component_stride) % align_mul;
3290 } else {
3291 const_offset += val.bytes();
3292 align_offset = (align_offset + val.bytes()) % align_mul;
3293 }
3294 bytes_read += val.bytes();
3295 vals[num_vals++] = val;
3296 }
3297
3298 /* create array of components */
3299 unsigned components_split = 0;
3300 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
3301 bool has_vgprs = false;
3302 for (unsigned i = 0; i < num_vals;) {
3303 Temp tmp[num_vals];
3304 unsigned num_tmps = 0;
3305 unsigned tmp_size = 0;
3306 RegType reg_type = RegType::sgpr;
3307 while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
3308 if (vals[i].type() == RegType::vgpr)
3309 reg_type = RegType::vgpr;
3310 tmp_size += vals[i].bytes();
3311 tmp[num_tmps++] = vals[i++];
3312 }
3313 if (num_tmps > 1) {
3314 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3315 aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
3316 for (unsigned i = 0; i < num_tmps; i++)
3317 vec->operands[i] = Operand(tmp[i]);
3318 tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
3319 vec->definitions[0] = Definition(tmp[0]);
3320 bld.insert(std::move(vec));
3321 }
3322
3323 if (tmp[0].bytes() % component_size) {
3324 /* trim tmp[0] */
3325 assert(i == num_vals);
3326 RegClass new_rc = RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
3327 tmp[0] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand(0u));
3328 }
3329
3330 RegClass elem_rc = RegClass::get(reg_type, component_size);
3331
3332 unsigned start = components_split;
3333
3334 if (tmp_size == elem_rc.bytes()) {
3335 allocated_vec[components_split++] = tmp[0];
3336 } else {
3337 assert(tmp_size % elem_rc.bytes() == 0);
3338 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
3339 aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
3340 for (unsigned i = 0; i < split->definitions.size(); i++) {
3341 Temp component = bld.tmp(elem_rc);
3342 allocated_vec[components_split++] = component;
3343 split->definitions[i] = Definition(component);
3344 }
3345 split->operands[0] = Operand(tmp[0]);
3346 bld.insert(std::move(split));
3347 }
3348
3349 /* try to p_as_uniform early so we can create more optimizable code and
3350 * also update allocated_vec */
3351 for (unsigned j = start; j < components_split; j++) {
3352 if (allocated_vec[j].bytes() % 4 == 0 && info->dst.type() == RegType::sgpr)
3353 allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
3354 has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
3355 }
3356 }
3357
3358 /* concatenate components and p_as_uniform() result if needed */
3359 if (info->dst.type() == RegType::vgpr || !has_vgprs)
3360 ctx->allocated_vec.emplace(info->dst.id(), allocated_vec);
3361
3362 int padding_bytes = MAX2((int)info->dst.bytes() - int(allocated_vec[0].bytes() * info->num_components), 0);
3363
3364 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3365 aco_opcode::p_create_vector, Format::PSEUDO, info->num_components + !!padding_bytes, 1)};
3366 for (unsigned i = 0; i < info->num_components; i++)
3367 vec->operands[i] = Operand(allocated_vec[i]);
3368 if (padding_bytes)
3369 vec->operands[info->num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
3370 if (info->dst.type() == RegType::sgpr && has_vgprs) {
3371 Temp tmp = bld.tmp(RegType::vgpr, info->dst.size());
3372 vec->definitions[0] = Definition(tmp);
3373 bld.insert(std::move(vec));
3374 bld.pseudo(aco_opcode::p_as_uniform, Definition(info->dst), tmp);
3375 } else {
3376 vec->definitions[0] = Definition(info->dst);
3377 bld.insert(std::move(vec));
3378 }
3379 }
3380
3381 Operand load_lds_size_m0(Builder& bld)
3382 {
3383 /* TODO: m0 does not need to be initialized on GFX9+ */
3384 return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
3385 }
3386
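/* LoadCallback for LDS: picks the widest ds_read* variant that the alignment and chip
 * generation allow (b96/b128 and the read2 forms need GFX7+), moves any constant offset
 * that does not fit the DS offset field(s) into the VGPR address, and returns the loaded
 * value, which may be wider than requested. */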
3387 Temp lds_load_callback(Builder& bld, const LoadEmitInfo *info,
3388 Temp offset, unsigned bytes_needed,
3389 unsigned align, unsigned const_offset,
3390 Temp dst_hint)
3391 {
3392 offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
3393
3394 Operand m = load_lds_size_m0(bld);
3395
3396 bool large_ds_read = bld.program->chip_class >= GFX7;
3397 bool usable_read2 = bld.program->chip_class >= GFX7;
3398
3399 bool read2 = false;
3400 unsigned size = 0;
3401 aco_opcode op;
3402 //TODO: use ds_read_u8_d16_hi/ds_read_u16_d16_hi if beneficial
3403 if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
3404 size = 16;
3405 op = aco_opcode::ds_read_b128;
3406 } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
3407 size = 16;
3408 read2 = true;
3409 op = aco_opcode::ds_read2_b64;
3410 } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
3411 size = 12;
3412 op = aco_opcode::ds_read_b96;
3413 } else if (bytes_needed >= 8 && align % 8 == 0) {
3414 size = 8;
3415 op = aco_opcode::ds_read_b64;
3416 } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0) {
3417 size = 8;
3418 read2 = true;
3419 op = aco_opcode::ds_read2_b32;
3420 } else if (bytes_needed >= 4 && align % 4 == 0) {
3421 size = 4;
3422 op = aco_opcode::ds_read_b32;
3423 } else if (bytes_needed >= 2 && align % 2 == 0) {
3424 size = 2;
3425 op = aco_opcode::ds_read_u16;
3426 } else {
3427 size = 1;
3428 op = aco_opcode::ds_read_u8;
3429 }
3430
3431 unsigned max_offset_plus_one = read2 ? 254 * (size / 2u) + 1 : 65536;
3432 if (const_offset >= max_offset_plus_one) {
3433 offset = bld.vadd32(bld.def(v1), offset, Operand(const_offset / max_offset_plus_one));
3434 const_offset %= max_offset_plus_one;
3435 }
3436
3437 if (read2)
3438 const_offset /= (size / 2u);
3439
3440 RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4));
3441 Temp val = rc == info->dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
3442 if (read2)
3443 bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
3444 else
3445 bld.ds(op, Definition(val), offset, m, const_offset);
3446
3447 if (size < 4)
3448 val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u));
3449
3450 return val;
3451 }
3452
3453 static auto emit_lds_load = emit_load<lds_load_callback, false, true, UINT32_MAX>;
3454
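/* LoadCallback for scalar loads: rounds bytes_needed up to the next
 * s_load_dword{,x2,x4,x8,x16} / s_buffer_load_dword* size and emits a single SMEM
 * instruction; reordering is currently disabled (see the FIXME below). */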
3455 Temp smem_load_callback(Builder& bld, const LoadEmitInfo *info,
3456 Temp offset, unsigned bytes_needed,
3457 unsigned align, unsigned const_offset,
3458 Temp dst_hint)
3459 {
3460 unsigned size = 0;
3461 aco_opcode op;
3462 if (bytes_needed <= 4) {
3463 size = 1;
3464 op = info->resource.id() ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
3465 } else if (bytes_needed <= 8) {
3466 size = 2;
3467 op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
3468 } else if (bytes_needed <= 16) {
3469 size = 4;
3470 op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
3471 } else if (bytes_needed <= 32) {
3472 size = 8;
3473 op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
3474 } else {
3475 size = 16;
3476 op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
3477 }
3478 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
3479 if (info->resource.id()) {
3480 load->operands[0] = Operand(info->resource);
3481 load->operands[1] = Operand(offset);
3482 } else {
3483 load->operands[0] = Operand(offset);
3484 load->operands[1] = Operand(0u);
3485 }
3486 RegClass rc(RegType::sgpr, size);
3487 Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
3488 load->definitions[0] = Definition(val);
3489 load->glc = info->glc;
3490 load->dlc = info->glc && bld.program->chip_class >= GFX10;
3491 load->barrier = info->barrier;
3492 load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
3493 bld.insert(std::move(load));
3494 return val;
3495 }
3496
3497 static auto emit_smem_load = emit_load<smem_load_callback, true, false, 1024>;
3498
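/* LoadCallback for buffer loads: selects buffer_load_ubyte/ushort/dword{,x2,x3,x4} from
 * bytes_needed (dwordx3 only on GFX7+), routes the offset through vaddr or soffset
 * depending on its register type, and puts const_offset into the MUBUF offset field. */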
3499 Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
3500 Temp offset, unsigned bytes_needed,
3501 unsigned align_, unsigned const_offset,
3502 Temp dst_hint)
3503 {
3504 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3505 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
3506
3507 if (info->soffset.id()) {
3508 if (soffset.isTemp())
3509 vaddr = bld.copy(bld.def(v1), soffset);
3510 soffset = Operand(info->soffset);
3511 }
3512
3513 unsigned bytes_size = 0;
3514 aco_opcode op;
3515 if (bytes_needed == 1) {
3516 bytes_size = 1;
3517 op = aco_opcode::buffer_load_ubyte;
3518 } else if (bytes_needed == 2) {
3519 bytes_size = 2;
3520 op = aco_opcode::buffer_load_ushort;
3521 } else if (bytes_needed <= 4) {
3522 bytes_size = 4;
3523 op = aco_opcode::buffer_load_dword;
3524 } else if (bytes_needed <= 8) {
3525 bytes_size = 8;
3526 op = aco_opcode::buffer_load_dwordx2;
3527 } else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) {
3528 bytes_size = 12;
3529 op = aco_opcode::buffer_load_dwordx3;
3530 } else {
3531 bytes_size = 16;
3532 op = aco_opcode::buffer_load_dwordx4;
3533 }
3534 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3535 mubuf->operands[0] = Operand(info->resource);
3536 mubuf->operands[1] = vaddr;
3537 mubuf->operands[2] = soffset;
3538 mubuf->offen = (offset.type() == RegType::vgpr);
3539 mubuf->glc = info->glc;
3540 mubuf->dlc = info->glc && bld.program->chip_class >= GFX10;
3541 mubuf->barrier = info->barrier;
3542 mubuf->can_reorder = info->can_reorder;
3543 mubuf->offset = const_offset;
3544 mubuf->swizzled = info->swizzle_component_size != 0;
3545 RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
3546 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
3547 mubuf->definitions[0] = Definition(val);
3548 bld.insert(std::move(mubuf));
3549
3550 return val;
3551 }
3552
3553 static auto emit_mubuf_load = emit_load<mubuf_load_callback, true, true, 4096>;
3554
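/* GFX6 has no FLAT instructions, so global memory is accessed through MUBUF with an
 * improvised buffer descriptor: num_records = -1 covers the whole range, an SGPR address
 * is folded into the descriptor base, and a VGPR address uses addr64 addressing with a
 * zero base. */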
3555 Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
3556 {
3557 uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3558 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3559
3560 if (addr.type() == RegType::vgpr)
3561 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), Operand(-1u), Operand(rsrc_conf));
3562 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), Operand(rsrc_conf));
3563 }
3564
3565 Temp global_load_callback(Builder& bld, const LoadEmitInfo *info,
3566 Temp offset, unsigned bytes_needed,
3567 unsigned align_, unsigned const_offset,
3568 Temp dst_hint)
3569 {
3570 unsigned bytes_size = 0;
3571 bool mubuf = bld.program->chip_class == GFX6;
3572 bool global = bld.program->chip_class >= GFX9;
3573 aco_opcode op;
3574 if (bytes_needed == 1) {
3575 bytes_size = 1;
3576 op = mubuf ? aco_opcode::buffer_load_ubyte : global ? aco_opcode::global_load_ubyte : aco_opcode::flat_load_ubyte;
3577 } else if (bytes_needed == 2) {
3578 bytes_size = 2;
3579 op = mubuf ? aco_opcode::buffer_load_ushort : global ? aco_opcode::global_load_ushort : aco_opcode::flat_load_ushort;
3580 } else if (bytes_needed <= 4) {
3581 bytes_size = 4;
3582 op = mubuf ? aco_opcode::buffer_load_dword : global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
3583 } else if (bytes_needed <= 8) {
3584 bytes_size = 8;
3585 op = mubuf ? aco_opcode::buffer_load_dwordx2 : global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
3586 } else if (bytes_needed <= 12 && !mubuf) {
3587 bytes_size = 12;
3588 op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
3589 } else {
3590 bytes_size = 16;
3591 op = mubuf ? aco_opcode::buffer_load_dwordx4 : global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
3592 }
3593 RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
3594 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
3595 if (mubuf) {
3596 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3597 mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset));
3598 mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3599 mubuf->operands[2] = Operand(0u);
3600 mubuf->glc = info->glc;
3601 mubuf->dlc = false;
3602 mubuf->offset = 0;
3603 mubuf->addr64 = offset.type() == RegType::vgpr;
3604 mubuf->disable_wqm = false;
3605 mubuf->barrier = info->barrier;
3606 mubuf->definitions[0] = Definition(val);
3607 bld.insert(std::move(mubuf));
3608 } else {
3609 offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset;
3610
3611 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
3612 flat->operands[0] = Operand(offset);
3613 flat->operands[1] = Operand(s1);
3614 flat->glc = info->glc;
3615 flat->dlc = info->glc && bld.program->chip_class >= GFX10;
3616 flat->barrier = info->barrier;
3617 flat->offset = 0u;
3618 flat->definitions[0] = Definition(val);
3619 bld.insert(std::move(flat));
3620 }
3621
3622 return val;
3623 }
3624
3625 static auto emit_global_load = emit_load<global_load_callback, true, true, 1>;
3626
3627 Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
3628 Temp address, unsigned base_offset, unsigned align)
3629 {
3630 assert(util_is_power_of_two_nonzero(align));
3631
3632 Builder bld(ctx->program, ctx->block);
3633
3634 unsigned num_components = dst.bytes() / elem_size_bytes;
3635 LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
3636 info.align_mul = align;
3637 info.align_offset = 0;
3638 info.barrier = barrier_shared;
3639 info.can_reorder = false;
3640 info.const_offset = base_offset;
3641 emit_lds_load(ctx, bld, &info);
3642
3643 return dst;
3644 }
3645
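/* Roughly: splits `src` into `count` pieces for a store, with offsets[] giving each
 * piece's byte offset. It reuses the components cached in ctx->allocated_vec when they
 * line up with the requested split and otherwise falls back to a plain p_split_vector;
 * sub-dword pieces are only supported for VGPR destinations. */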
3646 void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp *dst, unsigned *offsets, Temp src)
3647 {
3648 if (!count)
3649 return;
3650
3651 Builder bld(ctx->program, ctx->block);
3652
3653 ASSERTED bool is_subdword = false;
3654 for (unsigned i = 0; i < count; i++)
3655 is_subdword |= offsets[i] % 4;
3656 is_subdword |= (src.bytes() - offsets[count - 1]) % 4;
3657 assert(!is_subdword || dst_type == RegType::vgpr);
3658
3659 /* count == 1 fast path */
3660 if (count == 1) {
3661 if (dst_type == RegType::sgpr)
3662 dst[0] = bld.as_uniform(src);
3663 else
3664 dst[0] = as_vgpr(ctx, src);
3665 return;
3666 }
3667
3668 for (unsigned i = 0; i < count - 1; i++)
3669 dst[i] = bld.tmp(RegClass::get(dst_type, offsets[i + 1] - offsets[i]));
3670 dst[count - 1] = bld.tmp(RegClass::get(dst_type, src.bytes() - offsets[count - 1]));
3671
3672 if (is_subdword && src.type() == RegType::sgpr) {
3673 src = as_vgpr(ctx, src);
3674 } else {
3675 /* use allocated_vec if possible */
3676 auto it = ctx->allocated_vec.find(src.id());
3677 if (it != ctx->allocated_vec.end()) {
3678 unsigned total_size = 0;
3679 for (unsigned i = 0; (i < NIR_MAX_VEC_COMPONENTS) && it->second[i].bytes(); i++)
3680 total_size += it->second[i].bytes();
3681 if (total_size != src.bytes())
3682 goto split;
3683
3684 unsigned elem_size = it->second[0].bytes();
3685
3686 for (unsigned i = 0; i < count; i++) {
3687 if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
3688 goto split;
3689 }
3690
3691 for (unsigned i = 0; i < count; i++) {
3692 unsigned start_idx = offsets[i] / elem_size;
3693 unsigned op_count = dst[i].bytes() / elem_size;
3694 if (op_count == 1) {
3695 if (dst_type == RegType::sgpr)
3696 dst[i] = bld.as_uniform(it->second[start_idx]);
3697 else
3698 dst[i] = as_vgpr(ctx, it->second[start_idx]);
3699 continue;
3700 }
3701
3702 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)};
3703 for (unsigned j = 0; j < op_count; j++) {
3704 Temp tmp = it->second[start_idx + j];
3705 if (dst_type == RegType::sgpr)
3706 tmp = bld.as_uniform(tmp);
3707 vec->operands[j] = Operand(tmp);
3708 }
3709 vec->definitions[0] = Definition(dst[i]);
3710 bld.insert(std::move(vec));
3711 }
3712 return;
3713 }
3714 }
3715
3716 if (dst_type == RegType::sgpr)
3717 src = bld.as_uniform(src);
3718
3719 split:
3720 /* just split it */
3721 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
3722 split->operands[0] = Operand(src);
3723 for (unsigned i = 0; i < count; i++)
3724 split->definitions[i] = Definition(dst[i]);
3725 bld.insert(std::move(split));
3726 }
3727
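/* scan_write_mask()/advance_write_mask() walk a byte-granular write mask: starting at
 * the lowest bit still set in todo_mask, they return the next consecutive run that is
 * either entirely written (true) or entirely skipped (false). E.g. with mask=0b001111
 * and todo=0b111111, the first call yields start=0, count=4 (write) and the second
 * start=4, count=2 (skip). Note that advance_write_mask() clears every bit below
 * start+count; this is fine because start is always the lowest bit still set in
 * *todo_mask. */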
3728 bool scan_write_mask(uint32_t mask, uint32_t todo_mask,
3729 int *start, int *count)
3730 {
3731 unsigned start_elem = ffs(todo_mask) - 1;
3732 bool skip = !(mask & (1 << start_elem));
3733 if (skip)
3734 mask = ~mask & todo_mask;
3735
3736 mask &= todo_mask;
3737
3738 u_bit_scan_consecutive_range(&mask, start, count);
3739
3740 return !skip;
3741 }
3742
3743 void advance_write_mask(uint32_t *todo_mask, int start, int count)
3744 {
3745 *todo_mask &= ~u_bit_consecutive(0, count) << start;
3746 }
3747
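/* Stores `data` to LDS at address + base_offset, honouring wrmask: the mask is widened
 * to bytes, split into the largest ds_write_b8..b128 chunks the alignment allows, and
 * pairs of same-sized b32/b64 writes are fused into ds_write2_* on GFX7+ when their
 * offsets permit it. */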
3748 void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
3749 Temp address, unsigned base_offset, unsigned align)
3750 {
3751 assert(util_is_power_of_two_nonzero(align));
3752 assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
3753
3754 Builder bld(ctx->program, ctx->block);
3755 bool large_ds_write = ctx->options->chip_class >= GFX7;
3756 bool usable_write2 = ctx->options->chip_class >= GFX7;
3757
3758 unsigned write_count = 0;
3759 Temp write_datas[32];
3760 unsigned offsets[32];
3761 aco_opcode opcodes[32];
3762
3763 wrmask = widen_mask(wrmask, elem_size_bytes);
3764
3765 uint32_t todo = u_bit_consecutive(0, data.bytes());
3766 while (todo) {
3767 int offset, bytes;
3768 if (!scan_write_mask(wrmask, todo, &offset, &bytes)) {
3769 offsets[write_count] = offset;
3770 opcodes[write_count] = aco_opcode::num_opcodes;
3771 write_count++;
3772 advance_write_mask(&todo, offset, bytes);
3773 continue;
3774 }
3775
3776 bool aligned2 = offset % 2 == 0 && align % 2 == 0;
3777 bool aligned4 = offset % 4 == 0 && align % 4 == 0;
3778 bool aligned8 = offset % 8 == 0 && align % 8 == 0;
3779 bool aligned16 = offset % 16 == 0 && align % 16 == 0;
3780
3781 //TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
3782 aco_opcode op = aco_opcode::num_opcodes;
3783 if (bytes >= 16 && aligned16 && large_ds_write) {
3784 op = aco_opcode::ds_write_b128;
3785 bytes = 16;
3786 } else if (bytes >= 12 && aligned16 && large_ds_write) {
3787 op = aco_opcode::ds_write_b96;
3788 bytes = 12;
3789 } else if (bytes >= 8 && aligned8) {
3790 op = aco_opcode::ds_write_b64;
3791 bytes = 8;
3792 } else if (bytes >= 4 && aligned4) {
3793 op = aco_opcode::ds_write_b32;
3794 bytes = 4;
3795 } else if (bytes >= 2 && aligned2) {
3796 op = aco_opcode::ds_write_b16;
3797 bytes = 2;
3798 } else if (bytes >= 1) {
3799 op = aco_opcode::ds_write_b8;
3800 bytes = 1;
3801 } else {
3802 assert(false);
3803 }
3804
3805 offsets[write_count] = offset;
3806 opcodes[write_count] = op;
3807 write_count++;
3808 advance_write_mask(&todo, offset, bytes);
3809 }
3810
3811 Operand m = load_lds_size_m0(bld);
3812
3813 split_store_data(ctx, RegType::vgpr, write_count, write_datas, offsets, data);
3814
3815 for (unsigned i = 0; i < write_count; i++) {
3816 aco_opcode op = opcodes[i];
3817 if (op == aco_opcode::num_opcodes)
3818 continue;
3819
3820 Temp data = write_datas[i];
3821
3822 unsigned second = write_count;
3823 if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
3824 for (second = i + 1; second < write_count; second++) {
3825 if (opcodes[second] == op && (offsets[second] - offsets[i]) % data.bytes() == 0) {
3826 op = data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
3827 opcodes[second] = aco_opcode::num_opcodes;
3828 break;
3829 }
3830 }
3831 }
3832
3833 bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
3834 unsigned write2_off = (offsets[second] - offsets[i]) / data.bytes();
3835
3836 unsigned inline_offset = base_offset + offsets[i];
3837 unsigned max_offset = write2 ? (255 - write2_off) * data.bytes() : 65535;
3838 Temp address_offset = address;
3839 if (inline_offset > max_offset) {
3840 address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
3841 inline_offset = offsets[i];
3842 }
3843 assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */
3844
3845 if (write2) {
3846 Temp second_data = write_datas[second];
3847 inline_offset /= data.bytes();
3848 bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
3849 } else {
3850 bld.ds(op, address_offset, data, m, inline_offset);
3851 }
3852 }
3853 }
3854
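/* Conservative alignment estimate: 16 bytes, reduced to the largest power of two that
 * still divides const_offset. */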
3855 unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
3856 {
3857 unsigned align = 16;
3858 if (const_offset)
3859 align = std::min(align, 1u << (ffs(const_offset) - 1));
3860
3861 return align;
3862 }
3863
3864
3865 aco_opcode get_buffer_store_op(bool smem, unsigned bytes)
3866 {
3867 switch (bytes) {
3868 case 1:
3869 assert(!smem);
3870 return aco_opcode::buffer_store_byte;
3871 case 2:
3872 assert(!smem);
3873 return aco_opcode::buffer_store_short;
3874 case 4:
3875 return smem ? aco_opcode::s_buffer_store_dword : aco_opcode::buffer_store_dword;
3876 case 8:
3877 return smem ? aco_opcode::s_buffer_store_dwordx2 : aco_opcode::buffer_store_dwordx2;
3878 case 12:
3879 assert(!smem);
3880 return aco_opcode::buffer_store_dwordx3;
3881 case 16:
3882 return smem ? aco_opcode::s_buffer_store_dwordx4 : aco_opcode::buffer_store_dwordx4;
3883 }
3884 unreachable("Unexpected store size");
3885 return aco_opcode::num_opcodes;
3886 }
3887
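/* VMEM/SMEM counterpart of the LDS splitting above: breaks `data` into pieces of 1, 2,
 * 4, 8, 12 or 16 bytes that never exceed swizzle_element_size (no 12-byte stores on
 * SMEM or GFX6, dword or larger stores only when dword-aligned) and returns the pieces
 * and their byte offsets with the skipped holes removed. */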
3888 void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem, RegType dst_type,
3889 Temp data, unsigned writemask, int swizzle_element_size,
3890 unsigned *write_count, Temp *write_datas, unsigned *offsets)
3891 {
3892 unsigned write_count_with_skips = 0;
3893 bool skips[16];
3894
3895 /* determine how to split the data */
3896 unsigned todo = u_bit_consecutive(0, data.bytes());
3897 while (todo) {
3898 int offset, bytes;
3899 skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &bytes);
3900 offsets[write_count_with_skips] = offset;
3901 if (skips[write_count_with_skips]) {
3902 advance_write_mask(&todo, offset, bytes);
3903 write_count_with_skips++;
3904 continue;
3905 }
3906
3907 /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
3908 * larger than swizzle_element_size */
3909 bytes = MIN2(bytes, swizzle_element_size);
3910 if (bytes % 4)
3911 bytes = bytes > 4 ? bytes & ~0x3 : MIN2(bytes, 2);
3912
3913 /* SMEM and GFX6 VMEM can't emit 12-byte stores */
3914 if ((ctx->program->chip_class == GFX6 || smem) && bytes == 12)
3915 bytes = 8;
3916
3917 /* dword or larger stores have to be dword-aligned */
3918 unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
3919 unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
3920 bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
3921 if (!dword_aligned)
3922 bytes = MIN2(bytes, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
3923
3924 advance_write_mask(&todo, offset, bytes);
3925 write_count_with_skips++;
3926 }
3927
3928 /* actually split data */
3929 split_store_data(ctx, dst_type, write_count_with_skips, write_datas, offsets, data);
3930
3931 /* remove skips */
3932 for (unsigned i = 0; i < write_count_with_skips; i++) {
3933 if (skips[i])
3934 continue;
3935 write_datas[*write_count] = write_datas[i];
3936 offsets[*write_count] = offsets[i];
3937 (*write_count)++;
3938 }
3939 }
3940
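/* Builds a p_create_vector from `cnt` elements of elem_size_bytes each, filling unset
 * Temps with zero constants; the result is either split again (split_cnt) or its
 * components are cached in ctx->allocated_vec. */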
3941 Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes,
3942 unsigned split_cnt = 0u, Temp dst = Temp())
3943 {
3944 Builder bld(ctx->program, ctx->block);
3945 unsigned dword_size = elem_size_bytes / 4;
3946
3947 if (!dst.id())
3948 dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
3949
3950 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
3951 aco_ptr<Pseudo_instruction> instr {create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
3952 instr->definitions[0] = Definition(dst);
3953
3954 for (unsigned i = 0; i < cnt; ++i) {
3955 if (arr[i].id()) {
3956 assert(arr[i].size() == dword_size);
3957 allocated_vec[i] = arr[i];
3958 instr->operands[i] = Operand(arr[i]);
3959 } else {
3960 Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)), Operand(0u, dword_size == 2));
3961 allocated_vec[i] = zero;
3962 instr->operands[i] = Operand(zero);
3963 }
3964 }
3965
3966 bld.insert(std::move(instr));
3967
3968 if (split_cnt)
3969 emit_split_vector(ctx, dst, split_cnt);
3970 else
3971 ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* if split_cnt is set, emit_split_vector already does this */
3972
3973 return dst;
3974 }
3975
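/* The MUBUF immediate offset field is limited to 12 bits (0..4095), so any excess
 * constant offset is added to voffset here and only the remainder is returned. */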
3976 inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, unsigned const_offset)
3977 {
3978 if (const_offset >= 4096) {
3979 unsigned excess_const_offset = const_offset / 4096u * 4096u;
3980 const_offset %= 4096u;
3981
3982 if (!voffset.id())
3983 voffset = bld.copy(bld.def(v1), Operand(excess_const_offset));
3984 else if (unlikely(voffset.regClass() == s1))
3985 voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), Operand(excess_const_offset), Operand(voffset));
3986 else if (likely(voffset.regClass() == v1))
3987 voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand(excess_const_offset));
3988 else
3989 unreachable("Unsupported register class of voffset");
3990 }
3991
3992 return const_offset;
3993 }
3994
3995 void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
3996 unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false,
3997 bool swizzled = false)
3998 {
3999 assert(vdata.id());
4000 assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
4001 assert(vdata.size() >= 1 && vdata.size() <= 4);
4002
4003 Builder bld(ctx->program, ctx->block);
4004 aco_opcode op = get_buffer_store_op(false, vdata.bytes());
4005 const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
4006
4007 Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
4008 Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
4009 Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
4010 /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
4011 /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
4012 /* dlc*/ false, /* slc */ slc);
4013
4014 static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
4015 }
4016
4017 void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
4018 unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
4019 bool allow_combining = true, bool reorder = true, bool slc = false)
4020 {
4021 Builder bld(ctx->program, ctx->block);
4022 assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4023 assert(write_mask);
4024 write_mask = widen_mask(write_mask, elem_size_bytes);
4025
4026 unsigned write_count = 0;
4027 Temp write_datas[32];
4028 unsigned offsets[32];
4029 split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask,
4030 allow_combining ? 16 : 4, &write_count, write_datas, offsets);
4031
4032 for (unsigned i = 0; i < write_count; i++) {
4033 unsigned const_offset = offsets[i] + base_const_offset;
4034 emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc, !allow_combining);
4035 }
4036 }
4037
4038 void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
4039 unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
4040 unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
4041 {
4042 assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4043 assert((num_components * elem_size_bytes) == dst.bytes());
4044 assert(!!stride != allow_combining);
4045
4046 Builder bld(ctx->program, ctx->block);
4047
4048 LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
4049 info.component_stride = allow_combining ? 0 : stride;
4050 info.glc = true;
4051 info.swizzle_component_size = allow_combining ? 0 : 4;
4052 info.align_mul = MIN2(elem_size_bytes, 4);
4053 info.align_offset = 0;
4054 info.soffset = soffset;
4055 info.const_offset = base_const_offset;
4056 emit_mubuf_load(ctx, bld, &info);
4057 }
4058
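/* The offset helpers below operate on (Temp, unsigned) pairs: an optional dynamic
 * offset (SGPR or VGPR, possibly unset) plus a constant byte offset that can usually be
 * folded into the instruction. offset_add_from_nir() additionally scales a NIR offset
 * source by `stride` before adding it. */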
4059 std::pair<Temp, unsigned> offset_add_from_nir(isel_context *ctx, const std::pair<Temp, unsigned> &base_offset, nir_src *off_src, unsigned stride = 1u)
4060 {
4061 Builder bld(ctx->program, ctx->block);
4062 Temp offset = base_offset.first;
4063 unsigned const_offset = base_offset.second;
4064
4065 if (!nir_src_is_const(*off_src)) {
4066 Temp indirect_offset_arg = get_ssa_temp(ctx, off_src->ssa);
4067 Temp with_stride;
4068
4069 /* Calculate indirect offset with stride */
4070 if (likely(indirect_offset_arg.regClass() == v1))
4071 with_stride = bld.v_mul24_imm(bld.def(v1), indirect_offset_arg, stride);
4072 else if (indirect_offset_arg.regClass() == s1)
4073 with_stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), indirect_offset_arg);
4074 else
4075 unreachable("Unsupported register class of indirect offset");
4076
4077 /* Add to the supplied base offset */
4078 if (offset.id() == 0)
4079 offset = with_stride;
4080 else if (unlikely(offset.regClass() == s1 && with_stride.regClass() == s1))
4081 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), with_stride, offset);
4082 else if (offset.size() == 1 && with_stride.size() == 1)
4083 offset = bld.vadd32(bld.def(v1), with_stride, offset);
4084 else
4085 unreachable("Unsupported register class of indirect offset");
4086 } else {
4087 unsigned const_offset_arg = nir_src_as_uint(*off_src);
4088 const_offset += const_offset_arg * stride;
4089 }
4090
4091 return std::make_pair(offset, const_offset);
4092 }
4093
4094 std::pair<Temp, unsigned> offset_add(isel_context *ctx, const std::pair<Temp, unsigned> &off1, const std::pair<Temp, unsigned> &off2)
4095 {
4096 Builder bld(ctx->program, ctx->block);
4097 Temp offset;
4098
4099 if (off1.first.id() && off2.first.id()) {
4100 if (unlikely(off1.first.regClass() == s1 && off2.first.regClass() == s1))
4101 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), off1.first, off2.first);
4102 else if (off1.first.size() == 1 && off2.first.size() == 1)
4103 offset = bld.vadd32(bld.def(v1), off1.first, off2.first);
4104 else
4105 unreachable("Unsupported register class of indirect offset");
4106 } else {
4107 offset = off1.first.id() ? off1.first : off2.first;
4108 }
4109
4110 return std::make_pair(offset, off1.second + off2.second);
4111 }
4112
4113 std::pair<Temp, unsigned> offset_mul(isel_context *ctx, const std::pair<Temp, unsigned> &offs, unsigned multiplier)
4114 {
4115 Builder bld(ctx->program, ctx->block);
4116 unsigned const_offset = offs.second * multiplier;
4117
4118 if (!offs.first.id())
4119 return std::make_pair(offs.first, const_offset);
4120
4121 Temp offset = unlikely(offs.first.regClass() == s1)
4122 ? bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(multiplier), offs.first)
4123 : bld.v_mul24_imm(bld.def(v1), offs.first, multiplier);
4124
4125 return std::make_pair(offset, const_offset);
4126 }
4127
4128 std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride, unsigned component_stride)
4129 {
4130 Builder bld(ctx->program, ctx->block);
4131
4132    /* base is the driver_location, which is already multiplied by 4, so it is in dwords */
4133 unsigned const_offset = nir_intrinsic_base(instr) * base_stride;
4134 /* component is in bytes */
4135 const_offset += nir_intrinsic_component(instr) * component_stride;
4136
4137 /* offset should be interpreted in relation to the base, so the instruction effectively reads/writes another input/output when it has an offset */
4138 nir_src *off_src = nir_get_io_offset_src(instr);
4139 return offset_add_from_nir(ctx, std::make_pair(Temp(), const_offset), off_src, 4u * base_stride);
4140 }
4141
4142 std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned stride = 1u)
4143 {
4144 return get_intrinsic_io_basic_offset(ctx, instr, stride, stride);
4145 }
4146
4147 Temp get_tess_rel_patch_id(isel_context *ctx)
4148 {
4149 Builder bld(ctx->program, ctx->block);
4150
4151 switch (ctx->shader->info.stage) {
4152 case MESA_SHADER_TESS_CTRL:
4153 return bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffu),
4154 get_arg(ctx, ctx->args->ac.tcs_rel_ids));
4155 case MESA_SHADER_TESS_EVAL:
4156 return get_arg(ctx, ctx->args->tes_rel_patch_id);
4157 default:
4158 unreachable("Unsupported stage in get_tess_rel_patch_id");
4159 }
4160 }
4161
4162 std::pair<Temp, unsigned> get_tcs_per_vertex_input_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr)
4163 {
4164 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4165 Builder bld(ctx->program, ctx->block);
4166
4167 uint32_t tcs_in_patch_stride = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 4;
4168 uint32_t tcs_in_vertex_stride = ctx->tcs_num_inputs * 4;
4169
4170 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr);
4171
4172 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4173 offs = offset_add_from_nir(ctx, offs, vertex_index_src, tcs_in_vertex_stride);
4174
4175 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4176 Temp tcs_in_current_patch_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, tcs_in_patch_stride);
4177 offs = offset_add(ctx, offs, std::make_pair(tcs_in_current_patch_offset, 0));
4178
4179 return offset_mul(ctx, offs, 4u);
4180 }
4181
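/* Rough sketch of the LDS layout implied by the math below (illustrative):
 *   [ LS/VS inputs of all patches ]
 *   [ patch 0: per-vertex TCS outputs | per-patch TCS outputs ]
 *   [ patch 1: per-vertex TCS outputs | per-patch TCS outputs ] ...
 * so per-vertex outputs start at input_patch_size * tcs_num_patches, and the
 * per-patch outputs follow pervertex_output_patch_size within each patch. */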
4182 std::pair<Temp, unsigned> get_tcs_output_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, bool per_vertex = false)
4183 {
4184 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4185 Builder bld(ctx->program, ctx->block);
4186
4187 uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16;
4188 uint32_t output_vertex_size = ctx->tcs_num_outputs * 16;
4189 uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
4190 uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16;
4191
4192 std::pair<Temp, unsigned> offs = instr
4193 ? get_intrinsic_io_basic_offset(ctx, instr, 4u)
4194 : std::make_pair(Temp(), 0u);
4195
4196 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4197 Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, output_patch_stride);
4198
4199 if (per_vertex) {
4200 assert(instr);
4201
4202 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4203 offs = offset_add_from_nir(ctx, offs, vertex_index_src, output_vertex_size);
4204
4205 uint32_t output_patch0_offset = (input_patch_size * ctx->tcs_num_patches);
4206 offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_offset));
4207 } else {
4208 uint32_t output_patch0_patch_data_offset = (input_patch_size * ctx->tcs_num_patches + pervertex_output_patch_size);
4209 offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_patch_data_offset));
4210 }
4211
4212 return offs;
4213 }
4214
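/* Sketch of the offchip VMEM layout assumed by the math below: outputs are
 * stored attribute-major, i.e. for each attribute the vertices of all patches
 * are contiguous at 16 bytes per vertex, giving an attribute stride of
 * tcs_vertices_out * tcs_num_patches vec4 slots. Illustrative only. */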
4215 std::pair<Temp, unsigned> get_tcs_per_vertex_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr)
4216 {
4217 Builder bld(ctx->program, ctx->block);
4218
4219 unsigned vertices_per_patch = ctx->shader->info.tess.tcs_vertices_out;
4220 unsigned attr_stride = vertices_per_patch * ctx->tcs_num_patches;
4221
4222 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u);
4223
4224 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4225 Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, vertices_per_patch * 16u);
4226 offs = offset_add(ctx, offs, std::make_pair(patch_off, 0u));
4227
4228 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4229 offs = offset_add_from_nir(ctx, offs, vertex_index_src, 16u);
4230
4231 return offs;
4232 }
4233
4234 std::pair<Temp, unsigned> get_tcs_per_patch_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, unsigned const_base_offset = 0u)
4235 {
4236 Builder bld(ctx->program, ctx->block);
4237
4238 unsigned output_vertex_size = ctx->tcs_num_outputs * 16;
4239 unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
4240 unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches;
4241 unsigned attr_stride = ctx->tcs_num_patches;
4242
4243 std::pair<Temp, unsigned> offs = instr
4244 ? get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u)
4245 : std::make_pair(Temp(), 0u);
4246
4247 if (const_base_offset)
4248 offs.second += const_base_offset * attr_stride;
4249
4250 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4251 Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, 16u);
4252 offs = offset_add(ctx, offs, std::make_pair(patch_off, per_patch_data_offset));
4253
4254 return offs;
4255 }
4256
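/* Maps the driver location of a TCS output back to its API varying slot and
 * tests it against `mask` (e.g. the TES input masks). With an indirect
 * (non-constant) offset it conservatively returns false and reports
 * *indirect = true so callers can handle that case themselves. */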
4257 bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect)
4258 {
4259 assert(per_vertex || ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4260
4261 if (mask == 0)
4262 return false;
4263
4264 unsigned drv_loc = nir_intrinsic_base(instr);
4265 nir_src *off_src = nir_get_io_offset_src(instr);
4266
4267 if (!nir_src_is_const(*off_src)) {
4268 *indirect = true;
4269 return false;
4270 }
4271
4272 *indirect = false;
4273 uint64_t slot = per_vertex
4274 ? ctx->output_drv_loc_to_var_slot[ctx->shader->info.stage][drv_loc / 4]
4275 : (ctx->output_tcs_patch_drv_loc_to_var_slot[drv_loc / 4] - VARYING_SLOT_PATCH0);
4276 return (((uint64_t) 1) << slot) & mask;
4277 }
4278
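/* Tries to record the written components into ctx->outputs instead of emitting
 * memory stores; returns false when the offset is not a constant. */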
4279 bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
4280 {
4281 unsigned write_mask = nir_intrinsic_write_mask(instr);
4282 unsigned component = nir_intrinsic_component(instr);
4283 unsigned idx = nir_intrinsic_base(instr) + component;
4284
4285 nir_instr *off_instr = instr->src[1].ssa->parent_instr;
4286 if (off_instr->type != nir_instr_type_load_const)
4287 return false;
4288
4289 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4290 idx += nir_src_as_uint(instr->src[1]) * 4u;
4291
4292 if (instr->src[0].ssa->bit_size == 64)
4293 write_mask = widen_mask(write_mask, 2);
4294
4295 RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
4296
4297 for (unsigned i = 0; i < 8; ++i) {
4298 if (write_mask & (1 << i)) {
4299 ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
4300 ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
4301 }
4302 idx++;
4303 }
4304
4305 return true;
4306 }
4307
4308 bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp dst)
4309 {
4310 /* Only TCS per-vertex inputs are supported by this function.
4311     * Per-vertex inputs only match between the VS and TCS invocation IDs when the number of invocations is the same.
4312 */
4313 if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
4314 return false;
4315
4316 nir_src *off_src = nir_get_io_offset_src(instr);
4317 nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4318 nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr;
4319 bool can_use_temps = nir_src_is_const(*off_src) &&
4320 vertex_index_instr->type == nir_instr_type_intrinsic &&
4321 nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
4322
4323 if (!can_use_temps)
4324 return false;
4325
4326 unsigned idx = nir_intrinsic_base(instr) + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
4327 Temp *src = &ctx->inputs.temps[idx];
4328 create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
4329
4330 return true;
4331 }
4332
4333 void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
4334 {
4335 Builder bld(ctx->program, ctx->block);
4336
4337 if (ctx->tcs_in_out_eq && store_output_to_temps(ctx, instr)) {
4338       /* When the TCS reads this output only directly and only for the vertices matching its invocation ID, it is unnecessary to store the VS output to LDS. */
4339 bool indirect_write;
4340 bool temp_only_input = tcs_driver_location_matches_api_mask(ctx, instr, true, ctx->tcs_temp_only_inputs, &indirect_write);
4341 if (temp_only_input && !indirect_write)
4342 return;
4343 }
4344
4345 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, 4u);
4346 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4347 unsigned write_mask = nir_intrinsic_write_mask(instr);
4348 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u;
4349
4350 if (ctx->stage == vertex_es || ctx->stage == tess_eval_es) {
4351 /* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
4352 Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
4353 Temp es2gs_offset = get_arg(ctx, ctx->args->es2gs_offset);
4354 store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, true, true);
4355 } else {
4356 Temp lds_base;
4357
4358 if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
4359 /* GFX9+: ES stage is merged into GS, data is passed between them using LDS. */
4360 unsigned itemsize = ctx->stage == vertex_geometry_gs
4361 ? ctx->program->info->vs.es_info.esgs_itemsize
4362 : ctx->program->info->tes.es_info.esgs_itemsize;
4363 Temp thread_id = emit_mbcnt(ctx, bld.def(v1));
4364 Temp wave_idx = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand(4u << 16 | 24));
4365 Temp vertex_idx = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), thread_id,
4366 bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_idx), ctx->program->wave_size));
4367 lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, itemsize);
4368 } else if (ctx->stage == vertex_ls || ctx->stage == vertex_tess_control_hs) {
4369          /* GFX6-8: the VS runs on the LS stage when tessellation is used, but LS shares LDS space with HS.
4370 * GFX9+: LS is merged into HS, but still uses the same LDS layout.
4371 */
4372 Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id);
4373 lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u);
4374 } else {
4375 unreachable("Invalid LS or ES stage");
4376 }
4377
4378 offs = offset_add(ctx, offs, std::make_pair(lds_base, 0u));
4379 unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
4380 store_lds(ctx, elem_size_bytes, src, write_mask, offs.first, offs.second, lds_align);
4381 }
4382 }
4383
4384 bool tcs_output_is_tess_factor(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4385 {
4386 if (per_vertex)
4387 return false;
4388
4389 unsigned off = nir_intrinsic_base(instr) * 4u;
4390 return off == ctx->tcs_tess_lvl_out_loc ||
4391 off == ctx->tcs_tess_lvl_in_loc;
4392
4393 }
4394
4395 bool tcs_output_is_read_by_tes(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4396 {
4397 uint64_t mask = per_vertex
4398 ? ctx->program->info->tcs.tes_inputs_read
4399 : ctx->program->info->tcs.tes_patch_inputs_read;
4400
4401 bool indirect_write = false;
4402 bool output_read_by_tes = tcs_driver_location_matches_api_mask(ctx, instr, per_vertex, mask, &indirect_write);
4403 return indirect_write || output_read_by_tes;
4404 }
4405
4406 bool tcs_output_is_read_by_tcs(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4407 {
4408 uint64_t mask = per_vertex
4409 ? ctx->shader->info.outputs_read
4410 : ctx->shader->info.patch_outputs_read;
4411
4412 bool indirect_write = false;
4413 bool output_read = tcs_driver_location_matches_api_mask(ctx, instr, per_vertex, mask, &indirect_write);
4414 return indirect_write || output_read;
4415 }
4416
4417 void visit_store_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4418 {
4419 assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
4420 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4421
4422 Builder bld(ctx->program, ctx->block);
4423
4424 Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
4425 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4426 unsigned write_mask = nir_intrinsic_write_mask(instr);
4427
4428 bool is_tess_factor = tcs_output_is_tess_factor(ctx, instr, per_vertex);
4429 bool write_to_vmem = !is_tess_factor && tcs_output_is_read_by_tes(ctx, instr, per_vertex);
4430 bool write_to_lds = is_tess_factor || tcs_output_is_read_by_tcs(ctx, instr, per_vertex);
4431
4432 if (write_to_vmem) {
4433 std::pair<Temp, unsigned> vmem_offs = per_vertex
4434 ? get_tcs_per_vertex_output_vmem_offset(ctx, instr)
4435 : get_tcs_per_patch_output_vmem_offset(ctx, instr);
4436
4437 Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
4438 Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
4439 store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, false);
4440 }
4441
4442 if (write_to_lds) {
4443 std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
4444 unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
4445 store_lds(ctx, elem_size_bytes, store_val, write_mask, lds_offs.first, lds_offs.second, lds_align);
4446 }
4447 }
4448
4449 void visit_load_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4450 {
4451 assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
4452 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4453
4454 Builder bld(ctx->program, ctx->block);
4455
4456 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4457 std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
4458 unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
4459 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4460
4461 load_lds(ctx, elem_size_bytes, dst, lds_offs.first, lds_offs.second, lds_align);
4462 }
4463
4464 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
4465 {
4466 if (ctx->stage == vertex_vs ||
4467 ctx->stage == tess_eval_vs ||
4468 ctx->stage == fragment_fs ||
4469 ctx->stage == ngg_vertex_gs ||
4470 ctx->stage == ngg_tess_eval_gs ||
4471 ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
4472 bool stored_to_temps = store_output_to_temps(ctx, instr);
4473 if (!stored_to_temps) {
4474 fprintf(stderr, "Unimplemented output offset instruction:\n");
4475 nir_print_instr(instr->src[1].ssa->parent_instr, stderr);
4476 fprintf(stderr, "\n");
4477 abort();
4478 }
4479 } else if (ctx->stage == vertex_es ||
4480 ctx->stage == vertex_ls ||
4481 ctx->stage == tess_eval_es ||
4482 (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
4483 (ctx->stage == vertex_geometry_gs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
4484 (ctx->stage == tess_eval_geometry_gs && ctx->shader->info.stage == MESA_SHADER_TESS_EVAL)) {
4485 visit_store_ls_or_es_output(ctx, instr);
4486 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
4487 visit_store_tcs_output(ctx, instr, false);
4488 } else {
4489 unreachable("Shader stage not implemented");
4490 }
4491 }
4492
4493 void visit_load_output(isel_context *ctx, nir_intrinsic_instr *instr)
4494 {
4495 visit_load_tcs_output(ctx, instr, false);
4496 }
4497
4498 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
4499 {
4500 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
4501 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
4502
4503 Builder bld(ctx->program, ctx->block);
4504
4505 if (dst.regClass() == v2b) {
4506 if (ctx->program->has_16bank_lds) {
4507 assert(ctx->options->chip_class <= GFX8);
4508 Builder::Result interp_p1 =
4509 bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1),
4510 Operand(2u) /* P0 */, bld.m0(prim_mask), idx, component);
4511 interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b),
4512 coord1, bld.m0(prim_mask), interp_p1, idx, component);
4513 bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2,
4514 bld.m0(prim_mask), interp_p1, idx, component);
4515 } else {
4516 aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
4517
4518 if (ctx->options->chip_class == GFX8)
4519 interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
4520
4521 Builder::Result interp_p1 =
4522 bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1),
4523 coord1, bld.m0(prim_mask), idx, component);
4524 bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask),
4525 interp_p1, idx, component);
4526 }
4527 } else {
4528 Builder::Result interp_p1 =
4529 bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
4530 bld.m0(prim_mask), idx, component);
4531
4532 if (ctx->program->has_16bank_lds)
4533 interp_p1.instr->operands[0].setLateKill(true);
4534
4535 bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2,
4536 bld.m0(prim_mask), interp_p1, idx, component);
4537 }
4538 }
4539
4540 void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
4541 {
4542 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
4543 for (unsigned i = 0; i < num_components; i++)
4544 vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
4545 if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4546 assert(num_components == 4);
4547 Builder bld(ctx->program, ctx->block);
4548 vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
4549 }
4550
4551 for (Operand& op : vec->operands)
4552 op = op.isUndefined() ? Operand(0u) : op;
4553
4554 vec->definitions[0] = Definition(dst);
4555 ctx->block->instructions.emplace_back(std::move(vec));
4556 emit_split_vector(ctx, dst, num_components);
4557 return;
4558 }
4559
4560 void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
4561 {
4562 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4563 Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
4564 unsigned idx = nir_intrinsic_base(instr);
4565 unsigned component = nir_intrinsic_component(instr);
4566 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
4567
4568 nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
4569 if (offset) {
4570 assert(offset->u32 == 0);
4571 } else {
4572       /* the lower 15 bits of the prim_mask contain the offset into LDS,
4573        * while the upper bits contain the number of prims */
4574 Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
4575 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
4576 Builder bld(ctx->program, ctx->block);
4577 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
4578 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
4579 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
4580 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
4581 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
4582 }
4583
4584 if (instr->dest.ssa.num_components == 1) {
4585 emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
4586 } else {
4587 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
4588 for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
4589 {
4590 Temp tmp = {ctx->program->allocateId(), v1};
4591 emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
4592 vec->operands[i] = Operand(tmp);
4593 }
4594 vec->definitions[0] = Definition(dst);
4595 ctx->block->instructions.emplace_back(std::move(vec));
4596 }
4597 }
4598
4599 bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info,
4600 unsigned offset, unsigned stride, unsigned channels)
4601 {
4602 unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
4603 if (vtx_info->chan_byte_size != 4 && channels == 3)
4604 return false;
4605 return (ctx->options->chip_class != GFX6 && ctx->options->chip_class != GFX10) ||
4606 (offset % vertex_byte_size == 0 && stride % vertex_byte_size == 0);
4607 }
4608
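/* Picks a data format matching the number of channels we want to fetch.
 * Illustrative example: a 3-channel 16-bit attribute has no 16_16_16 format
 * (see the INVALID entry below), so this either widens the fetch to 4 channels
 * when that stays within the stride, or reduces the channel count and lets the
 * caller issue additional loads. */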
4609 uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info,
4610 unsigned offset, unsigned stride, unsigned *channels)
4611 {
4612 if (!vtx_info->chan_byte_size) {
4613 *channels = vtx_info->num_channels;
4614 return vtx_info->chan_format;
4615 }
4616
4617 unsigned num_channels = *channels;
4618 if (!check_vertex_fetch_size(ctx, vtx_info, offset, stride, *channels)) {
4619 unsigned new_channels = num_channels + 1;
4620       /* first, assume that more loads are worse and try using a larger data format */
4621 while (new_channels <= 4 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) {
4622 new_channels++;
4623 /* don't make the attribute potentially out-of-bounds */
4624 if (offset + new_channels * vtx_info->chan_byte_size > stride)
4625 new_channels = 5;
4626 }
4627
4628 if (new_channels == 5) {
4629 /* then try decreasing load size (at the cost of more loads) */
4630 new_channels = *channels;
4631 while (new_channels > 1 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels))
4632 new_channels--;
4633 }
4634
4635 if (new_channels < *channels)
4636 *channels = new_channels;
4637 num_channels = new_channels;
4638 }
4639
4640 switch (vtx_info->chan_format) {
4641 case V_008F0C_BUF_DATA_FORMAT_8:
4642 return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
4643 V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
4644 case V_008F0C_BUF_DATA_FORMAT_16:
4645 return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
4646 V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
4647 case V_008F0C_BUF_DATA_FORMAT_32:
4648 return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
4649 V_008F0C_BUF_DATA_FORMAT_32_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
4650 }
4651 unreachable("shouldn't reach here");
4652 return V_008F0C_BUF_DATA_FORMAT_INVALID;
4653 }
4654
4655 /* For 2_10_10_10 formats, the alpha channel is handled as unsigned by pre-Vega HW,
4656  * so we may need to fix it up. */
4657 Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
4658 {
4659 Builder bld(ctx->program, ctx->block);
4660
4661 if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
4662 alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
4663
4664 /* For the integer-like cases, do a natural sign extension.
4665 *
4666 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
4667 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
4668 * exponent.
4669 */
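   /* Worked example (illustrative): a 2-bit alpha value of 3 (0b11) lands in bits
    * 31:30 after the shift left, and the arithmetic shift right by 30 turns it
    * into 0xffffffff, i.e. -1; a value of 1 stays 1. */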
4670 alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
4671 alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
4672
4673 /* Convert back to the right type. */
4674 if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
4675 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4676 Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0xbf800000u), alpha);
4677 alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
4678 } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
4679 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4680 }
4681
4682 return alpha;
4683 }
4684
4685 void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
4686 {
4687 Builder bld(ctx->program, ctx->block);
4688 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4689 if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
4690
4691 nir_instr *off_instr = instr->src[0].ssa->parent_instr;
4692 if (off_instr->type != nir_instr_type_load_const) {
4693 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
4694 nir_print_instr(off_instr, stderr);
4695 fprintf(stderr, "\n");
4696 }
4697 uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
4698
4699 Temp vertex_buffers = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->vertex_buffers));
4700
4701 unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
4702 unsigned component = nir_intrinsic_component(instr);
4703 unsigned bitsize = instr->dest.ssa.bit_size;
4704 unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
4705 uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
4706 uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
4707 unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
4708
4709 unsigned dfmt = attrib_format & 0xf;
4710 unsigned nfmt = (attrib_format >> 4) & 0x7;
4711 const struct ac_data_format_info *vtx_info = ac_get_data_format_info(dfmt);
4712
4713 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
4714 unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
4715 unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
4716 bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
4717 if (post_shuffle)
4718 num_channels = MAX2(num_channels, 3);
4719
4720 Operand off = bld.copy(bld.def(s1), Operand(attrib_binding * 16u));
4721 Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);
4722
4723 Temp index;
4724 if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
4725 uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
4726 Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
4727 if (divisor) {
4728 Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
4729 if (divisor != 1) {
4730 Temp divided = bld.tmp(v1);
4731 emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
4732 index = bld.vadd32(bld.def(v1), start_instance, divided);
4733 } else {
4734 index = bld.vadd32(bld.def(v1), start_instance, instance_id);
4735 }
4736 } else {
4737 index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), start_instance);
4738 }
4739 } else {
4740 index = bld.vadd32(bld.def(v1),
4741 get_arg(ctx, ctx->args->ac.base_vertex),
4742 get_arg(ctx, ctx->args->ac.vertex_id));
4743 }
4744
4745 Temp channels[num_channels];
4746 unsigned channel_start = 0;
4747 bool direct_fetch = false;
4748
4749 /* skip unused channels at the start */
4750 if (vtx_info->chan_byte_size && !post_shuffle) {
4751 channel_start = ffs(mask) - 1;
4752 for (unsigned i = 0; i < channel_start; i++)
4753 channels[i] = Temp(0, s1);
4754 } else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) {
4755 num_channels = 3 - (ffs(mask) - 1);
4756 }
4757
4758 /* load channels */
4759 while (channel_start < num_channels) {
4760 unsigned fetch_component = num_channels - channel_start;
4761 unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
4762 bool expanded = false;
4763
4764          /* use MUBUF when possible to avoid potential alignment issues */
4765 /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */
4766 bool use_mubuf = (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT ||
4767 nfmt == V_008F0C_BUF_NUM_FORMAT_UINT ||
4768 nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) &&
4769 vtx_info->chan_byte_size == 4;
4770 unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
4771 if (!use_mubuf) {
4772 fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_component);
4773 } else {
4774 if (fetch_component == 3 && ctx->options->chip_class == GFX6) {
4775 /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
4776 fetch_component = 4;
4777 expanded = true;
4778 }
4779 }
4780
4781 unsigned fetch_bytes = fetch_component * bitsize / 8;
4782
4783 Temp fetch_index = index;
4784 if (attrib_stride != 0 && fetch_offset > attrib_stride) {
4785 fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index);
4786 fetch_offset = fetch_offset % attrib_stride;
4787 }
4788
4789 Operand soffset(0u);
4790 if (fetch_offset >= 4096) {
4791 soffset = bld.copy(bld.def(s1), Operand(fetch_offset / 4096 * 4096));
4792 fetch_offset %= 4096;
4793 }
4794
4795 aco_opcode opcode;
4796 switch (fetch_bytes) {
4797 case 2:
4798 assert(!use_mubuf && bitsize == 16);
4799 opcode = aco_opcode::tbuffer_load_format_d16_x;
4800 break;
4801 case 4:
4802 if (bitsize == 16) {
4803 assert(!use_mubuf);
4804 opcode = aco_opcode::tbuffer_load_format_d16_xy;
4805 } else {
4806 opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
4807 }
4808 break;
4809 case 6:
4810 assert(!use_mubuf && bitsize == 16);
4811 opcode = aco_opcode::tbuffer_load_format_d16_xyz;
4812 break;
4813 case 8:
4814 if (bitsize == 16) {
4815 assert(!use_mubuf);
4816 opcode = aco_opcode::tbuffer_load_format_d16_xyzw;
4817 } else {
4818 opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
4819 }
4820 break;
4821 case 12:
4822 assert(ctx->options->chip_class >= GFX7 ||
4823 (!use_mubuf && ctx->options->chip_class == GFX6));
4824 opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
4825 break;
4826 case 16:
4827 opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
4828 break;
4829 default:
4830 unreachable("Unimplemented load_input vector size");
4831 }
4832
4833 Temp fetch_dst;
4834 if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle &&
4835 !expanded && (alpha_adjust == RADV_ALPHA_ADJUST_NONE ||
4836 num_channels <= 3)) {
4837 direct_fetch = true;
4838 fetch_dst = dst;
4839 } else {
4840 fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));
4841 }
4842
4843 if (use_mubuf) {
4844 Instruction *mubuf = bld.mubuf(opcode,
4845 Definition(fetch_dst), list, fetch_index, soffset,
4846 fetch_offset, false, false, true).instr;
4847 static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
4848 } else {
4849 Instruction *mtbuf = bld.mtbuf(opcode,
4850 Definition(fetch_dst), list, fetch_index, soffset,
4851 fetch_dfmt, nfmt, fetch_offset, false, true).instr;
4852 static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
4853 }
4854
4855 emit_split_vector(ctx, fetch_dst, fetch_dst.size());
4856
4857 if (fetch_component == 1) {
4858 channels[channel_start] = fetch_dst;
4859 } else {
4860 for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
4861 channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i,
4862 bitsize == 16 ? v2b : v1);
4863 }
4864
4865 channel_start += fetch_component;
4866 }
4867
4868 if (!direct_fetch) {
4869 bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
4870 nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
4871
4872 static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
4873 static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
4874 const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
4875
4876 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
4877 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
4878 unsigned num_temp = 0;
4879 for (unsigned i = 0; i < dst.size(); i++) {
4880 unsigned idx = i + component;
4881 if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) {
4882 Temp channel = channels[swizzle[idx]];
4883 if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE)
4884 channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);
4885 vec->operands[i] = Operand(channel);
4886
4887 num_temp++;
4888 elems[i] = channel;
4889 } else if (is_float && idx == 3) {
4890 vec->operands[i] = Operand(0x3f800000u);
4891 } else if (!is_float && idx == 3) {
4892 vec->operands[i] = Operand(1u);
4893 } else {
4894 vec->operands[i] = Operand(0u);
4895 }
4896 }
4897 vec->definitions[0] = Definition(dst);
4898 ctx->block->instructions.emplace_back(std::move(vec));
4899 emit_split_vector(ctx, dst, dst.size());
4900
4901 if (num_temp == dst.size())
4902 ctx->allocated_vec.emplace(dst.id(), elems);
4903 }
4904 } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
4905 unsigned offset_idx = instr->intrinsic == nir_intrinsic_load_input ? 0 : 1;
4906 nir_instr *off_instr = instr->src[offset_idx].ssa->parent_instr;
4907 if (off_instr->type != nir_instr_type_load_const ||
4908 nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
4909 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
4910 nir_print_instr(off_instr, stderr);
4911 fprintf(stderr, "\n");
4912 }
4913
4914 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
4915 nir_const_value* offset = nir_src_as_const_value(instr->src[offset_idx]);
4916 if (offset) {
4917 assert(offset->u32 == 0);
4918 } else {
4919          /* the lower 15 bits of the prim_mask contain the offset into LDS,
4920           * while the upper bits contain the number of prims */
4921 Temp offset_src = get_ssa_temp(ctx, instr->src[offset_idx].ssa);
4922 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
4923 Builder bld(ctx->program, ctx->block);
4924 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
4925 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
4926 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
4927 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
4928 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
4929 }
4930
4931 unsigned idx = nir_intrinsic_base(instr);
4932 unsigned component = nir_intrinsic_component(instr);
4933 unsigned vertex_id = 2; /* P0 */
4934
4935 if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
4936 nir_const_value* src0 = nir_src_as_const_value(instr->src[0]);
4937 switch (src0->u32) {
4938 case 0:
4939 vertex_id = 2; /* P0 */
4940 break;
4941 case 1:
4942 vertex_id = 0; /* P10 */
4943 break;
4944 case 2:
4945 vertex_id = 1; /* P20 */
4946 break;
4947 default:
4948 unreachable("invalid vertex index");
4949 }
4950 }
4951
4952 if (dst.size() == 1) {
4953 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(vertex_id), bld.m0(prim_mask), idx, component);
4954 } else {
4955 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
4956 for (unsigned i = 0; i < dst.size(); i++)
4957 vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(vertex_id), bld.m0(prim_mask), idx, component + i);
4958 vec->definitions[0] = Definition(dst);
4959 bld.insert(std::move(vec));
4960 }
4961
4962 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_EVAL) {
4963 Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
4964 Temp soffset = get_arg(ctx, ctx->args->oc_lds);
4965 std::pair<Temp, unsigned> offs = get_tcs_per_patch_output_vmem_offset(ctx, instr);
4966 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8u;
4967
4968 load_vmem_mubuf(ctx, dst, ring, offs.first, soffset, offs.second, elem_size_bytes, instr->dest.ssa.num_components);
4969 } else {
4970 unreachable("Shader stage not implemented");
4971 }
4972 }
4973
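/* On merged ES+GS (vertex_geometry_gs / tess_eval_geometry_gs), each
 * gs_vtx_offset register packs two 16-bit per-vertex offsets, which is why the
 * code below extracts the high or low half and masks the result. Rough
 * description, based on the handling below. */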
4974 std::pair<Temp, unsigned> get_gs_per_vertex_input_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride = 1u)
4975 {
4976 assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
4977
4978 Builder bld(ctx->program, ctx->block);
4979 nir_src *vertex_src = nir_get_io_vertex_index_src(instr);
4980 Temp vertex_offset;
4981
4982 if (!nir_src_is_const(*vertex_src)) {
4983 /* better code could be created, but this case probably doesn't happen
4984 * much in practice */
4985 Temp indirect_vertex = as_vgpr(ctx, get_ssa_temp(ctx, vertex_src->ssa));
4986 for (unsigned i = 0; i < ctx->shader->info.gs.vertices_in; i++) {
4987 Temp elem;
4988
4989 if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
4990 elem = get_arg(ctx, ctx->args->gs_vtx_offset[i / 2u * 2u]);
4991 if (i % 2u)
4992 elem = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), elem);
4993 } else {
4994 elem = get_arg(ctx, ctx->args->gs_vtx_offset[i]);
4995 }
4996
4997 if (vertex_offset.id()) {
4998 Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)),
4999 Operand(i), indirect_vertex);
5000 vertex_offset = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), vertex_offset, elem, cond);
5001 } else {
5002 vertex_offset = elem;
5003 }
5004 }
5005
5006 if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs)
5007 vertex_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu), vertex_offset);
5008 } else {
5009 unsigned vertex = nir_src_as_uint(*vertex_src);
5010 if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs)
5011 vertex_offset = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
5012 get_arg(ctx, ctx->args->gs_vtx_offset[vertex / 2u * 2u]),
5013 Operand((vertex % 2u) * 16u), Operand(16u));
5014 else
5015 vertex_offset = get_arg(ctx, ctx->args->gs_vtx_offset[vertex]);
5016 }
5017
5018 std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, base_stride);
5019 offs = offset_add(ctx, offs, std::make_pair(vertex_offset, 0u));
5020 return offset_mul(ctx, offs, 4u);
5021 }
5022
5023 void visit_load_gs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
5024 {
5025 assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
5026
5027 Builder bld(ctx->program, ctx->block);
5028 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5029 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
5030
5031 if (ctx->stage == geometry_gs) {
5032 std::pair<Temp, unsigned> offs = get_gs_per_vertex_input_offset(ctx, instr, ctx->program->wave_size);
5033 Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_GS * 16u));
5034 load_vmem_mubuf(ctx, dst, ring, offs.first, Temp(), offs.second, elem_size_bytes, instr->dest.ssa.num_components, 4u * ctx->program->wave_size, false, true);
5035 } else if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
5036 std::pair<Temp, unsigned> offs = get_gs_per_vertex_input_offset(ctx, instr);
5037 unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
5038 load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align);
5039 } else {
5040 unreachable("Unsupported GS stage.");
5041 }
5042 }
5043
5044 void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
5045 {
5046 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5047
5048 Builder bld(ctx->program, ctx->block);
5049 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5050
5051 if (load_input_from_temps(ctx, instr, dst))
5052 return;
5053
5054 std::pair<Temp, unsigned> offs = get_tcs_per_vertex_input_lds_offset(ctx, instr);
5055 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
5056 unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
5057
5058 load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align);
5059 }
5060
5061 void visit_load_tes_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
5062 {
5063 assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5064
5065 Builder bld(ctx->program, ctx->block);
5066
5067 Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
5068 Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
5069 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5070
5071 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
5072 std::pair<Temp, unsigned> offs = get_tcs_per_vertex_output_vmem_offset(ctx, instr);
5073
5074 load_vmem_mubuf(ctx, dst, ring, offs.first, oc_lds, offs.second, elem_size_bytes, instr->dest.ssa.num_components, 0u, true, true);
5075 }
5076
5077 void visit_load_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
5078 {
5079 switch (ctx->shader->info.stage) {
5080 case MESA_SHADER_GEOMETRY:
5081 visit_load_gs_per_vertex_input(ctx, instr);
5082 break;
5083 case MESA_SHADER_TESS_CTRL:
5084 visit_load_tcs_per_vertex_input(ctx, instr);
5085 break;
5086 case MESA_SHADER_TESS_EVAL:
5087 visit_load_tes_per_vertex_input(ctx, instr);
5088 break;
5089 default:
5090 unreachable("Unimplemented shader stage");
5091 }
5092 }
5093
5094 void visit_load_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr)
5095 {
5096 visit_load_tcs_output(ctx, instr, true);
5097 }
5098
5099 void visit_store_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr)
5100 {
5101 assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
5102 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5103
5104 visit_store_tcs_output(ctx, instr, true);
5105 }
5106
5107 void visit_load_tess_coord(isel_context *ctx, nir_intrinsic_instr *instr)
5108 {
5109 assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5110
5111 Builder bld(ctx->program, ctx->block);
5112 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5113
5114 Operand tes_u(get_arg(ctx, ctx->args->tes_u));
5115 Operand tes_v(get_arg(ctx, ctx->args->tes_v));
5116 Operand tes_w(0u);
5117
5118 if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) {
5119 Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5120 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0x3f800000u /* 1.0f */), tmp);
5121 tes_w = Operand(tmp);
5122 }
5123
5124 Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5125 emit_split_vector(ctx, tess_coord, 3);
5126 }
5127
5128 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
5129 {
5130 if (ctx->program->info->need_indirect_descriptor_sets) {
5131 Builder bld(ctx->program, ctx->block);
5132 Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
5133 Operand off = bld.copy(bld.def(s1), Operand(desc_set << 2));
5134       return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off);
5135 }
5136
5137 return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
5138 }
5139
5140
5141 void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
5142 {
5143 Builder bld(ctx->program, ctx->block);
5144 Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
5145 if (!nir_dest_is_divergent(instr->dest))
5146 index = bld.as_uniform(index);
5147 unsigned desc_set = nir_intrinsic_desc_set(instr);
5148 unsigned binding = nir_intrinsic_binding(instr);
5149
5150 Temp desc_ptr;
5151 radv_pipeline_layout *pipeline_layout = ctx->options->layout;
5152 radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
5153 unsigned offset = layout->binding[binding].offset;
5154 unsigned stride;
5155 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
5156 layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
5157 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
5158 desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
5159 offset = pipeline_layout->push_constant_size + 16 * idx;
5160 stride = 16;
5161 } else {
5162 desc_ptr = load_desc_ptr(ctx, desc_set);
5163 stride = layout->binding[binding].size;
5164 }
5165
5166 nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
5167 unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
5168 if (stride != 1) {
5169 if (nir_const_index) {
5170 const_index = const_index * stride;
5171 } else if (index.type() == RegType::vgpr) {
5172 bool index24bit = layout->binding[binding].array_size <= 0x1000000;
5173 index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
5174 } else {
5175 index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
5176 }
5177 }
5178 if (offset) {
5179 if (nir_const_index) {
5180 const_index = const_index + offset;
5181 } else if (index.type() == RegType::vgpr) {
5182 index = bld.vadd32(bld.def(v1), Operand(offset), index);
5183 } else {
5184 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
5185 }
5186 }
5187
5188 if (nir_const_index && const_index == 0) {
5189 index = desc_ptr;
5190 } else if (index.type() == RegType::vgpr) {
5191 index = bld.vadd32(bld.def(v1),
5192 nir_const_index ? Operand(const_index) : Operand(index),
5193 Operand(desc_ptr));
5194 } else {
5195 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5196 nir_const_index ? Operand(const_index) : Operand(index),
5197 Operand(desc_ptr));
5198 }
5199
5200 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
5201 }
5202
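/* A buffer load may go through SMEM only when the result is uniform (not a
 * VGPR destination), when the caller allows it, and, before GFX8, only when it
 * doesn't need glc semantics; otherwise it is emitted as a MUBUF load. */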
5203 void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
5204 Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
5205 bool glc=false, bool readonly=true, bool allow_smem=true)
5206 {
5207 Builder bld(ctx->program, ctx->block);
5208
5209 bool use_smem = dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem;
5210 if (use_smem)
5211 offset = bld.as_uniform(offset);
5212
5213 LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5214 info.glc = glc;
5215 info.barrier = readonly ? barrier_none : barrier_buffer;
5216 info.can_reorder = readonly;
5217 info.align_mul = align_mul;
5218 info.align_offset = align_offset;
5219 if (use_smem)
5220 emit_smem_load(ctx, bld, &info);
5221 else
5222 emit_mubuf_load(ctx, bld, &info);
5223 }
5224
5225 void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
5226 {
5227 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5228 Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
5229
5230 Builder bld(ctx->program, ctx->block);
5231
5232 nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
5233 unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
5234 unsigned binding = nir_intrinsic_binding(idx_instr);
5235 radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
5236
5237 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
5238 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
5239 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5240 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
5241 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5242 if (ctx->options->chip_class >= GFX10) {
5243 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
5244 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
5245 S_008F0C_RESOURCE_LEVEL(1);
5246 } else {
5247 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5248 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5249 }
5250 Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
5251 Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
5252 Operand(0xFFFFFFFFu),
5253 Operand(desc_type));
5254 rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5255 rsrc, upper_dwords);
5256 } else {
5257 rsrc = convert_pointer_to_64_bit(ctx, rsrc);
5258 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
5259 }
5260 unsigned size = instr->dest.ssa.bit_size / 8;
5261 load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5262 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
5263 }
5264
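/* Fast path sketch: for 32-bit loads with a constant offset whose dwords are
 * all among the inline push constants (already passed in SGPRs), the result is
 * assembled directly from those arguments. Otherwise this falls back to an SMEM
 * load from the push constant pointer, with byte_align_scalar() fixing up
 * unaligned 8/16-bit results. */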
5265 void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
5266 {
5267 Builder bld(ctx->program, ctx->block);
5268 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5269 unsigned offset = nir_intrinsic_base(instr);
5270 unsigned count = instr->dest.ssa.num_components;
5271 nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
5272
5273 if (index_cv && instr->dest.ssa.bit_size == 32) {
5274 unsigned start = (offset + index_cv->u32) / 4u;
5275 start -= ctx->args->ac.base_inline_push_consts;
5276 if (start + count <= ctx->args->ac.num_inline_push_consts) {
5277 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
5278 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5279 for (unsigned i = 0; i < count; ++i) {
5280 elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
5281 vec->operands[i] = Operand{elems[i]};
5282 }
5283 vec->definitions[0] = Definition(dst);
5284 ctx->block->instructions.emplace_back(std::move(vec));
5285 ctx->allocated_vec.emplace(dst.id(), elems);
5286 return;
5287 }
5288 }
5289
5290 Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5291 if (offset != 0) // TODO check if index != 0 as well
5292 index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
5293 Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
5294 Temp vec = dst;
5295 bool trim = false;
5296 bool aligned = true;
5297
5298 if (instr->dest.ssa.bit_size == 8) {
5299 aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5300 bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
5301 if (!aligned)
5302 vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
5303 } else if (instr->dest.ssa.bit_size == 16) {
5304 aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5305 if (!aligned)
5306 vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
5307 }
5308
5309 aco_opcode op;
5310
5311 switch (vec.size()) {
5312 case 1:
5313 op = aco_opcode::s_load_dword;
5314 break;
5315 case 2:
5316 op = aco_opcode::s_load_dwordx2;
5317 break;
5318 case 3:
5319 vec = bld.tmp(s4);
5320 trim = true;
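      /* fallthrough */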
5321 case 4:
5322 op = aco_opcode::s_load_dwordx4;
5323 break;
5324 case 6:
5325 vec = bld.tmp(s8);
5326 trim = true;
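      /* fallthrough */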
5327 case 8:
5328 op = aco_opcode::s_load_dwordx8;
5329 break;
5330 default:
5331 unreachable("unimplemented or forbidden load_push_constant.");
5332 }
5333
5334 static_cast<SMEM_instruction*>(bld.smem(op, Definition(vec), ptr, index).instr)->prevent_overflow = true;
5335
5336 if (!aligned) {
5337 Operand byte_offset = index_cv ? Operand((offset + index_cv->u32) % 4) : Operand(index);
5338 byte_align_scalar(ctx, vec, byte_offset, dst);
5339 return;
5340 }
5341
5342 if (trim) {
5343 emit_split_vector(ctx, vec, 4);
5344 RegClass rc = dst.size() == 3 ? s1 : s2;
5345 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5346 emit_extract_vector(ctx, vec, 0, rc),
5347 emit_extract_vector(ctx, vec, 1, rc),
5348 emit_extract_vector(ctx, vec, 2, rc));
5349
5350 }
5351 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
5352 }
5353
5354 void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
5355 {
5356 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5357
5358 Builder bld(ctx->program, ctx->block);
5359
5360 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
5361 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5362 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
5363 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5364 if (ctx->options->chip_class >= GFX10) {
5365 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
5366 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
5367 S_008F0C_RESOURCE_LEVEL(1);
5368 } else {
5369 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5370 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5371 }
5372
5373 unsigned base = nir_intrinsic_base(instr);
5374 unsigned range = nir_intrinsic_range(instr);
5375
5376 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5377 if (base && offset.type() == RegType::sgpr)
5378 offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
5379 else if (base && offset.type() == RegType::vgpr)
5380 offset = bld.vadd32(bld.def(v1), Operand(base), offset);
5381
5382 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5383 bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
5384 Operand(MIN2(base + range, ctx->shader->constant_data_size)),
5385 Operand(desc_type));
5386 unsigned size = instr->dest.ssa.bit_size / 8;
5387 // TODO: get alignment information for subdword constants
5388 load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5389 }
5390
5391 void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
5392 {
5393 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5394 ctx->cf_info.exec_potentially_empty_discard = true;
5395
5396 ctx->program->needs_exact = true;
5397
5398 // TODO: optimize uniform conditions
5399 Builder bld(ctx->program, ctx->block);
5400 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5401 assert(src.regClass() == bld.lm);
5402 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
5403 bld.pseudo(aco_opcode::p_discard_if, src);
5404 ctx->block->kind |= block_kind_uses_discard_if;
5405 return;
5406 }
5407
5408 void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
5409 {
5410 Builder bld(ctx->program, ctx->block);
5411
5412 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5413 ctx->cf_info.exec_potentially_empty_discard = true;
5414
5415 bool divergent = ctx->cf_info.parent_if.is_divergent ||
5416 ctx->cf_info.parent_loop.has_divergent_continue;
5417
5418 if (ctx->block->loop_nest_depth &&
5419 ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
5420 /* we handle discards the same way as jump instructions */
5421 append_logical_end(ctx->block);
5422
5423 /* in loops, discard behaves like break */
5424 Block *linear_target = ctx->cf_info.parent_loop.exit;
5425 ctx->block->kind |= block_kind_discard;
5426
5427 if (!divergent) {
5428 /* uniform discard - loop ends here */
5429 assert(nir_instr_is_last(&instr->instr));
5430 ctx->block->kind |= block_kind_uniform;
5431 ctx->cf_info.has_branch = true;
5432 bld.branch(aco_opcode::p_branch);
5433 add_linear_edge(ctx->block->index, linear_target);
5434 return;
5435 }
5436
5437       /* we add a break right behind the discard() instruction */
5438 ctx->block->kind |= block_kind_break;
5439 unsigned idx = ctx->block->index;
5440
5441 ctx->cf_info.parent_loop.has_divergent_branch = true;
5442 ctx->cf_info.nir_to_aco[instr->instr.block->index] = idx;
5443
5444 /* remove critical edges from linear CFG */
5445 bld.branch(aco_opcode::p_branch);
5446 Block* break_block = ctx->program->create_and_insert_block();
5447 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
5448 break_block->kind |= block_kind_uniform;
5449 add_linear_edge(idx, break_block);
5450 add_linear_edge(break_block->index, linear_target);
5451 bld.reset(break_block);
5452 bld.branch(aco_opcode::p_branch);
5453
5454 Block* continue_block = ctx->program->create_and_insert_block();
5455 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
5456 add_linear_edge(idx, continue_block);
5457 append_logical_start(continue_block);
5458 ctx->block = continue_block;
5459
5460 return;
5461 }
5462
5463 /* it can currently happen that NIR doesn't remove the unreachable code */
5464 if (!nir_instr_is_last(&instr->instr)) {
5465 ctx->program->needs_exact = true;
5466       /* save the current exec mask in a temporary so the discard condition
5467        * is not overwritten by outer exec masks before the discard executes */
5468 Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, bld.lm));
5469 bld.pseudo(aco_opcode::p_discard_if, cond);
5470 ctx->block->kind |= block_kind_uses_discard_if;
5471 return;
5472 }
5473
5474 /* This condition is incorrect for uniformly branched discards in a loop
5475 * predicated by a divergent condition, but the above code catches that case
5476 * and the discard would end up turning into a discard_if.
5477 * For example:
5478 * if (divergent) {
5479 * while (...) {
5480 * if (uniform) {
5481 * discard;
5482 * }
5483 * }
5484 * }
5485 */
5486 if (!ctx->cf_info.parent_if.is_divergent) {
5487 /* program just ends here */
5488 ctx->block->kind |= block_kind_uniform;
5489 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
5490 0 /* enabled mask */, 9 /* dest */,
5491 false /* compressed */, true/* done */, true /* valid mask */);
5492 bld.sopp(aco_opcode::s_endpgm);
5493       // TODO: this may be followed by a dead-code branch that only exists to sanitize NIR phis
5494 } else {
5495 ctx->block->kind |= block_kind_discard;
5496       /* branch and linear edge are added by visit_if() */
5497 }
5498 }
5499
5500 enum aco_descriptor_type {
5501 ACO_DESC_IMAGE,
5502 ACO_DESC_FMASK,
5503 ACO_DESC_SAMPLER,
5504 ACO_DESC_BUFFER,
5505 ACO_DESC_PLANE_0,
5506 ACO_DESC_PLANE_1,
5507 ACO_DESC_PLANE_2,
5508 };
5509
5510 static bool
5511 should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
5512 if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
5513 return false;
5514 ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
5515 return dim == ac_image_cube ||
5516 dim == ac_image_1darray ||
5517 dim == ac_image_2darray ||
5518 dim == ac_image_2darraymsaa;
5519 }
5520
5521 Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
5522 enum aco_descriptor_type desc_type,
5523 const nir_tex_instr *tex_instr, bool image, bool write)
5524 {
5525 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
5526 std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
5527 if (it != ctx->tex_desc.end())
5528 return it->second;
5529 */
5530 Temp index = Temp();
5531 bool index_set = false;
5532 unsigned constant_index = 0;
5533 unsigned descriptor_set;
5534 unsigned base_index;
5535 Builder bld(ctx->program, ctx->block);
5536
5537 if (!deref_instr) {
5538 assert(tex_instr && !image);
5539 descriptor_set = 0;
5540 base_index = tex_instr->sampler_index;
5541 } else {
5542 while(deref_instr->deref_type != nir_deref_type_var) {
5543 unsigned array_size = glsl_get_aoa_size(deref_instr->type);
5544 if (!array_size)
5545 array_size = 1;
5546
5547 assert(deref_instr->deref_type == nir_deref_type_array);
5548 nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
5549 if (const_value) {
5550 constant_index += array_size * const_value->u32;
5551 } else {
5552 Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
5553 if (indirect.type() == RegType::vgpr)
5554 indirect = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), indirect);
5555
5556 if (array_size != 1)
5557 indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
5558
5559 if (!index_set) {
5560 index = indirect;
5561 index_set = true;
5562 } else {
5563 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
5564 }
5565 }
5566
5567 deref_instr = nir_src_as_deref(deref_instr->parent);
5568 }
5569 descriptor_set = deref_instr->var->data.descriptor_set;
5570 base_index = deref_instr->var->data.binding;
5571 }
5572
5573 Temp list = load_desc_ptr(ctx, descriptor_set);
5574 list = convert_pointer_to_64_bit(ctx, list);
5575
5576 struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
5577 struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
5578 unsigned offset = binding->offset;
5579 unsigned stride = binding->size;
5580 aco_opcode opcode;
5581 RegClass type;
5582
5583 assert(base_index < layout->binding_count);
5584
5585 switch (desc_type) {
5586 case ACO_DESC_IMAGE:
5587 type = s8;
5588 opcode = aco_opcode::s_load_dwordx8;
5589 break;
5590 case ACO_DESC_FMASK:
5591 type = s8;
5592 opcode = aco_opcode::s_load_dwordx8;
5593 offset += 32;
5594 break;
5595 case ACO_DESC_SAMPLER:
5596 type = s4;
5597 opcode = aco_opcode::s_load_dwordx4;
5598 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
5599 offset += radv_combined_image_descriptor_sampler_offset(binding);
5600 break;
5601 case ACO_DESC_BUFFER:
5602 type = s4;
5603 opcode = aco_opcode::s_load_dwordx4;
5604 break;
5605 case ACO_DESC_PLANE_0:
5606 case ACO_DESC_PLANE_1:
5607 type = s8;
5608 opcode = aco_opcode::s_load_dwordx8;
5609 offset += 32 * (desc_type - ACO_DESC_PLANE_0);
5610 break;
5611 case ACO_DESC_PLANE_2:
5612 type = s4;
5613 opcode = aco_opcode::s_load_dwordx4;
5614 offset += 64;
5615 break;
5616 default:
5617 unreachable("invalid desc_type\n");
5618 }
5619
5620 offset += constant_index * stride;
5621
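   /* when the sampler index is constant or all immutable samplers of the binding
    * are identical, the sampler descriptor can be materialized as literal dwords
    * instead of being loaded from the descriptor set */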
5622 if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
5623 (!index_set || binding->immutable_samplers_equal)) {
5624 if (binding->immutable_samplers_equal)
5625 constant_index = 0;
5626
5627 const uint32_t *samplers = radv_immutable_samplers(layout, binding);
5628 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5629 Operand(samplers[constant_index * 4 + 0]),
5630 Operand(samplers[constant_index * 4 + 1]),
5631 Operand(samplers[constant_index * 4 + 2]),
5632 Operand(samplers[constant_index * 4 + 3]));
5633 }
5634
5635 Operand off;
5636 if (!index_set) {
5637 off = bld.copy(bld.def(s1), Operand(offset));
5638 } else {
5639 off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
5640 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
5641 }
5642
5643 Temp res = bld.smem(opcode, bld.def(type), list, off);
5644
5645 if (desc_type == ACO_DESC_PLANE_2) {
5646 Temp components[8];
5647 for (unsigned i = 0; i < 8; i++)
5648 components[i] = bld.tmp(s1);
5649 bld.pseudo(aco_opcode::p_split_vector,
5650 Definition(components[0]),
5651 Definition(components[1]),
5652 Definition(components[2]),
5653 Definition(components[3]),
5654 res);
5655
5656 Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
5657 bld.pseudo(aco_opcode::p_split_vector,
5658 bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
5659 Definition(components[4]),
5660 Definition(components[5]),
5661 Definition(components[6]),
5662 Definition(components[7]),
5663 desc2);
5664
5665 res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
5666 components[0], components[1], components[2], components[3],
5667 components[4], components[5], components[6], components[7]);
5668 }
5669
5670 return res;
5671 }
5672
5673 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
5674 {
5675 switch (dim) {
5676 case GLSL_SAMPLER_DIM_BUF:
5677 return 1;
5678 case GLSL_SAMPLER_DIM_1D:
5679 return array ? 2 : 1;
5680 case GLSL_SAMPLER_DIM_2D:
5681 return array ? 3 : 2;
5682 case GLSL_SAMPLER_DIM_MS:
5683 return array ? 4 : 3;
5684 case GLSL_SAMPLER_DIM_3D:
5685 case GLSL_SAMPLER_DIM_CUBE:
5686 return 3;
5687 case GLSL_SAMPLER_DIM_RECT:
5688 case GLSL_SAMPLER_DIM_SUBPASS:
5689 return 2;
5690 case GLSL_SAMPLER_DIM_SUBPASS_MS:
5691 return 3;
5692 default:
5693 break;
5694 }
5695 return 0;
5696 }
5697
5698
5699 /* Adjust the sample index according to FMASK.
5700 *
5701 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
5702 * which is the identity mapping. Each nibble says which physical sample
5703 * should be fetched to get that sample.
5704 *
5705 * For example, 0x11111100 means there are only 2 samples stored and
5706 * the second sample covers 3/4 of the pixel. When reading samples 0
5707 * and 1, return physical sample 0 (determined by the first two 0s
5708 * in FMASK), otherwise return physical sample 1.
5709 *
5710 * The sample index should be adjusted as follows:
5711 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
5712 */
5713 static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vector<Temp>& coords, Operand sample_index, Temp fmask_desc_ptr)
5714 {
5715 Builder bld(ctx->program, ctx->block);
5716 Temp fmask = bld.tmp(v1);
5717 unsigned dim = ctx->options->chip_class >= GFX10
5718 ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
5719 : 0;
5720
5721 Temp coord = da ? bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), coords[0], coords[1], coords[2]) :
5722 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), coords[0], coords[1]);
5723 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 3, 1)};
5724 load->operands[0] = Operand(fmask_desc_ptr);
5725 load->operands[1] = Operand(s4); /* no sampler */
5726 load->operands[2] = Operand(coord);
5727 load->definitions[0] = Definition(fmask);
5728 load->glc = false;
5729 load->dlc = false;
5730 load->dmask = 0x1;
5731 load->unrm = true;
5732 load->da = da;
5733 load->dim = dim;
5734 load->can_reorder = true; /* fmask images shouldn't be modified */
5735 ctx->block->instructions.emplace_back(std::move(load));
5736
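   /* the FMASK nibble for logical sample i starts at bit 4 * i, so compute
    * sample_index * 4 as the extraction offset */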
5737 Operand sample_index4;
5738 if (sample_index.isConstant()) {
5739 if (sample_index.constantValue() < 16) {
5740 sample_index4 = Operand(sample_index.constantValue() << 2);
5741 } else {
5742 sample_index4 = Operand(0u);
5743 }
5744 } else if (sample_index.regClass() == s1) {
5745 sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
5746 } else {
5747 assert(sample_index.regClass() == v1);
5748 sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
5749 }
5750
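   /* extract the 4-bit physical sample: offset 0 only needs a mask, offset 28
    * (sample 7) only needs a shift, anything else uses v_bfe_u32 */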
5751 Temp final_sample;
5752 if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
5753 final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
5754 else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
5755 final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
5756 else
5757 final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
5758
5759 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
5760     * resource descriptor is 0 (invalid).
5761 */
5762 Temp compare = bld.tmp(bld.lm);
5763 bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
5764 Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
5765
5766 Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
5767
5768 /* Replace the MSAA sample index. */
5769 return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
5770 }
5771
5772 static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
5773 {
5774
5775 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
5776 enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5777 bool is_array = glsl_sampler_type_is_array(type);
5778 ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
5779 assert(!add_frag_pos && "Input attachments should be lowered.");
5780 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
5781 bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
5782 int count = image_type_to_components_count(dim, is_array);
5783 std::vector<Temp> coords(count);
5784 Builder bld(ctx->program, ctx->block);
5785
5786 if (is_ms) {
5787 count--;
5788 Temp src2 = get_ssa_temp(ctx, instr->src[2].ssa);
5789 /* get sample index */
5790 if (instr->intrinsic == nir_intrinsic_image_deref_load) {
5791 nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
5792 Operand sample_index = sample_cv ? Operand(sample_cv->u32) : Operand(emit_extract_vector(ctx, src2, 0, v1));
5793 std::vector<Temp> fmask_load_address;
5794 for (unsigned i = 0; i < (is_array ? 3 : 2); i++)
5795 fmask_load_address.emplace_back(emit_extract_vector(ctx, src0, i, v1));
5796
5797 Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
5798 coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr);
5799 } else {
5800 coords[count] = emit_extract_vector(ctx, src2, 0, v1);
5801 }
5802 }
5803
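   /* GFX9 addresses 1D images as 2D: insert y = 0 and move the layer index
    * to the third component for arrays */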
5804 if (gfx9_1d) {
5805 coords[0] = emit_extract_vector(ctx, src0, 0, v1);
5806 coords.resize(coords.size() + 1);
5807 coords[1] = bld.copy(bld.def(v1), Operand(0u));
5808 if (is_array)
5809 coords[2] = emit_extract_vector(ctx, src0, 1, v1);
5810 } else {
5811 for (int i = 0; i < count; i++)
5812 coords[i] = emit_extract_vector(ctx, src0, i, v1);
5813 }
5814
5815 if (instr->intrinsic == nir_intrinsic_image_deref_load ||
5816 instr->intrinsic == nir_intrinsic_image_deref_store) {
5817 int lod_index = instr->intrinsic == nir_intrinsic_image_deref_load ? 3 : 4;
5818 bool level_zero = nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
5819
5820 if (!level_zero)
5821 coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa));
5822 }
5823
5824 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
5825 for (unsigned i = 0; i < coords.size(); i++)
5826 vec->operands[i] = Operand(coords[i]);
5827 Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
5828 vec->definitions[0] = Definition(res);
5829 ctx->block->instructions.emplace_back(std::move(vec));
5830 return res;
5831 }
5832
5833
5834 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
5835 {
5836 Builder bld(ctx->program, ctx->block);
5837 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
5838 const struct glsl_type *type = glsl_without_array(var->type);
5839 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5840 bool is_array = glsl_sampler_type_is_array(type);
5841 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5842
5843 if (dim == GLSL_SAMPLER_DIM_BUF) {
5844 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
5845 unsigned num_channels = util_last_bit(mask);
5846 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
5847 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
5848
5849 aco_opcode opcode;
5850 switch (num_channels) {
5851 case 1:
5852 opcode = aco_opcode::buffer_load_format_x;
5853 break;
5854 case 2:
5855 opcode = aco_opcode::buffer_load_format_xy;
5856 break;
5857 case 3:
5858 opcode = aco_opcode::buffer_load_format_xyz;
5859 break;
5860 case 4:
5861 opcode = aco_opcode::buffer_load_format_xyzw;
5862 break;
5863 default:
5864 unreachable(">4 channel buffer image load");
5865 }
5866 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
5867 load->operands[0] = Operand(rsrc);
5868 load->operands[1] = Operand(vindex);
5869 load->operands[2] = Operand((uint32_t) 0);
5870 Temp tmp;
5871 if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
5872 tmp = dst;
5873 else
5874 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
5875 load->definitions[0] = Definition(tmp);
5876 load->idxen = true;
5877 load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
5878 load->dlc = load->glc && ctx->options->chip_class >= GFX10;
5879 load->barrier = barrier_image;
5880 ctx->block->instructions.emplace_back(std::move(load));
5881
5882 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
5883 return;
5884 }
5885
5886 Temp coords = get_image_coords(ctx, instr, type);
5887 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
5888
5889 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
5890 unsigned num_components = util_bitcount(dmask);
5891 Temp tmp;
5892 if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
5893 tmp = dst;
5894 else
5895 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
5896
5897 bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
5898 aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
5899
5900 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1)};
5901 load->operands[0] = Operand(resource);
5902 load->operands[1] = Operand(s4); /* no sampler */
5903 load->operands[2] = Operand(coords);
5904 load->definitions[0] = Definition(tmp);
5905 load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
5906 load->dlc = load->glc && ctx->options->chip_class >= GFX10;
5907 load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
5908 load->dmask = dmask;
5909 load->unrm = true;
5910 load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
5911 load->barrier = barrier_image;
5912 ctx->block->instructions.emplace_back(std::move(load));
5913
5914 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
5915 return;
5916 }
5917
5918 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
5919 {
5920 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
5921 const struct glsl_type *type = glsl_without_array(var->type);
5922 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5923 bool is_array = glsl_sampler_type_is_array(type);
5924 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
5925
5926 bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
5927
5928 if (dim == GLSL_SAMPLER_DIM_BUF) {
5929 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
5930 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
5931 aco_opcode opcode;
5932 switch (data.size()) {
5933 case 1:
5934 opcode = aco_opcode::buffer_store_format_x;
5935 break;
5936 case 2:
5937 opcode = aco_opcode::buffer_store_format_xy;
5938 break;
5939 case 3:
5940 opcode = aco_opcode::buffer_store_format_xyz;
5941 break;
5942 case 4:
5943 opcode = aco_opcode::buffer_store_format_xyzw;
5944 break;
5945 default:
5946 unreachable(">4 channel buffer image store");
5947 }
5948 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
5949 store->operands[0] = Operand(rsrc);
5950 store->operands[1] = Operand(vindex);
5951 store->operands[2] = Operand((uint32_t) 0);
5952 store->operands[3] = Operand(data);
5953 store->idxen = true;
5954 store->glc = glc;
5955 store->dlc = false;
5956 store->disable_wqm = true;
5957 store->barrier = barrier_image;
5958 ctx->program->needs_exact = true;
5959 ctx->block->instructions.emplace_back(std::move(store));
5960 return;
5961 }
5962
5963 assert(data.type() == RegType::vgpr);
5964 Temp coords = get_image_coords(ctx, instr, type);
5965 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
5966
5967 bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
5968 aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
5969
5970 aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 0)};
5971 store->operands[0] = Operand(resource);
5972 store->operands[1] = Operand(data);
5973 store->operands[2] = Operand(coords);
5974 store->glc = glc;
5975 store->dlc = false;
5976 store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
5977 store->dmask = (1 << data.size()) - 1;
5978 store->unrm = true;
5979 store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
5980 store->disable_wqm = true;
5981 store->barrier = barrier_image;
5982 ctx->program->needs_exact = true;
5983 ctx->block->instructions.emplace_back(std::move(store));
5984 return;
5985 }
5986
5987 void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
5988 {
5989 /* return the previous value if dest is ever used */
5990 bool return_previous = false;
5991 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
5992 return_previous = true;
5993 break;
5994 }
5995 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
5996 return_previous = true;
5997 break;
5998 }
5999
6000 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6001 const struct glsl_type *type = glsl_without_array(var->type);
6002 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6003 bool is_array = glsl_sampler_type_is_array(type);
6004 Builder bld(ctx->program, ctx->block);
6005
6006 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
6007    assert(data.size() == 1 && "64bit image atomics not yet implemented.");
6008
6009 if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
6010 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
6011
6012 aco_opcode buf_op, image_op;
6013 switch (instr->intrinsic) {
6014 case nir_intrinsic_image_deref_atomic_add:
6015 buf_op = aco_opcode::buffer_atomic_add;
6016 image_op = aco_opcode::image_atomic_add;
6017 break;
6018 case nir_intrinsic_image_deref_atomic_umin:
6019 buf_op = aco_opcode::buffer_atomic_umin;
6020 image_op = aco_opcode::image_atomic_umin;
6021 break;
6022 case nir_intrinsic_image_deref_atomic_imin:
6023 buf_op = aco_opcode::buffer_atomic_smin;
6024 image_op = aco_opcode::image_atomic_smin;
6025 break;
6026 case nir_intrinsic_image_deref_atomic_umax:
6027 buf_op = aco_opcode::buffer_atomic_umax;
6028 image_op = aco_opcode::image_atomic_umax;
6029 break;
6030 case nir_intrinsic_image_deref_atomic_imax:
6031 buf_op = aco_opcode::buffer_atomic_smax;
6032 image_op = aco_opcode::image_atomic_smax;
6033 break;
6034 case nir_intrinsic_image_deref_atomic_and:
6035 buf_op = aco_opcode::buffer_atomic_and;
6036 image_op = aco_opcode::image_atomic_and;
6037 break;
6038 case nir_intrinsic_image_deref_atomic_or:
6039 buf_op = aco_opcode::buffer_atomic_or;
6040 image_op = aco_opcode::image_atomic_or;
6041 break;
6042 case nir_intrinsic_image_deref_atomic_xor:
6043 buf_op = aco_opcode::buffer_atomic_xor;
6044 image_op = aco_opcode::image_atomic_xor;
6045 break;
6046 case nir_intrinsic_image_deref_atomic_exchange:
6047 buf_op = aco_opcode::buffer_atomic_swap;
6048 image_op = aco_opcode::image_atomic_swap;
6049 break;
6050 case nir_intrinsic_image_deref_atomic_comp_swap:
6051 buf_op = aco_opcode::buffer_atomic_cmpswap;
6052 image_op = aco_opcode::image_atomic_cmpswap;
6053 break;
6054 default:
6055 unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
6056 }
6057
6058 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6059
6060 if (dim == GLSL_SAMPLER_DIM_BUF) {
6061 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6062 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
6063 //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
6064 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6065 mubuf->operands[0] = Operand(resource);
6066 mubuf->operands[1] = Operand(vindex);
6067 mubuf->operands[2] = Operand((uint32_t)0);
6068 mubuf->operands[3] = Operand(data);
6069 if (return_previous)
6070 mubuf->definitions[0] = Definition(dst);
6071 mubuf->offset = 0;
6072 mubuf->idxen = true;
6073 mubuf->glc = return_previous;
6074 mubuf->dlc = false; /* Not needed for atomics */
6075 mubuf->disable_wqm = true;
6076 mubuf->barrier = barrier_image;
6077 ctx->program->needs_exact = true;
6078 ctx->block->instructions.emplace_back(std::move(mubuf));
6079 return;
6080 }
6081
6082 Temp coords = get_image_coords(ctx, instr, type);
6083 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
6084 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 3, return_previous ? 1 : 0)};
6085 mimg->operands[0] = Operand(resource);
6086 mimg->operands[1] = Operand(data);
6087 mimg->operands[2] = Operand(coords);
6088 if (return_previous)
6089 mimg->definitions[0] = Definition(dst);
6090 mimg->glc = return_previous;
6091 mimg->dlc = false; /* Not needed for atomics */
6092 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6093 mimg->dmask = (1 << data.size()) - 1;
6094 mimg->unrm = true;
6095 mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
6096 mimg->disable_wqm = true;
6097 mimg->barrier = barrier_image;
6098 ctx->program->needs_exact = true;
6099 ctx->block->instructions.emplace_back(std::move(mimg));
6100 return;
6101 }
6102
6103 void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
6104 {
6105 if (in_elements && ctx->options->chip_class == GFX8) {
6106 /* we only have to divide by 1, 2, 4, 8, 12 or 16 */
6107 Builder bld(ctx->program, ctx->block);
6108
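      /* size / stride, where stride is 1, 2, 4, 8, 12 or 16: powers of two are a
       * single shift by ff1(stride); for 12, pre-divide by 3 (v_mul_hi with
       * 0xaaaaaaab == ceil(2^33 / 3) plus a shift by 1) and the final shift by
       * ff1(12) == 2 supplies the remaining division by 4 */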
6109 Temp size = emit_extract_vector(ctx, desc, 2, s1);
6110
6111 Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), bld.copy(bld.def(v1), Operand(0xaaaaaaabu)), size);
6112 size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.as_uniform(size_div3), Operand(1u));
6113
6114 Temp stride = emit_extract_vector(ctx, desc, 1, s1);
6115 stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
6116
6117 Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand(12u));
6118 size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));
6119
6120 Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;
6121 bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc),
6122 size, bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));
6123 if (dst.type() == RegType::vgpr)
6124 bld.copy(Definition(dst), shr_dst);
6125
6126 /* TODO: we can probably calculate this faster with v_skip when stride != 12 */
6127 } else {
6128 emit_extract_vector(ctx, desc, 2, dst);
6129 }
6130 }
6131
6132 void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
6133 {
6134 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6135 const struct glsl_type *type = glsl_without_array(var->type);
6136 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6137 bool is_array = glsl_sampler_type_is_array(type);
6138 Builder bld(ctx->program, ctx->block);
6139
6140 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
6141 Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
6142 return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
6143 }
6144
6145 /* LOD */
6146 Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6147
6148 /* Resource */
6149 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
6150
6151 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6152
6153 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)};
6154 mimg->operands[0] = Operand(resource);
6155 mimg->operands[1] = Operand(s4); /* no sampler */
6156 mimg->operands[2] = Operand(lod);
6157 uint8_t& dmask = mimg->dmask;
6158 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6159 mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
6160 mimg->da = glsl_sampler_type_is_array(type);
6161 mimg->can_reorder = true;
6162 Definition& def = mimg->definitions[0];
6163 ctx->block->instructions.emplace_back(std::move(mimg));
6164
6165 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
6166 glsl_sampler_type_is_array(type)) {
6167
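      /* for cube arrays the hardware reports the layer-face count (6 per cube),
       * but the array size in cubes is expected, so the third component is
       * divided by 6 below (0x2AAAAAAB ~= 2^32 / 6, taken as the high half of
       * the product) */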
6168 assert(instr->dest.ssa.num_components == 3);
6169 Temp tmp = {ctx->program->allocateId(), v3};
6170 def = Definition(tmp);
6171 emit_split_vector(ctx, tmp, 3);
6172
6173 /* divide 3rd value by 6 by multiplying with magic number */
6174 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
6175 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
6176
6177 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
6178 emit_extract_vector(ctx, tmp, 0, v1),
6179 emit_extract_vector(ctx, tmp, 1, v1),
6180 by_6);
6181
6182 } else if (ctx->options->chip_class == GFX9 &&
6183 glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
6184 glsl_sampler_type_is_array(type)) {
6185 assert(instr->dest.ssa.num_components == 2);
6186 def = Definition(dst);
6187 dmask = 0x5;
6188 } else {
6189 def = Definition(dst);
6190 }
6191
6192 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
6193 }
6194
6195 void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
6196 {
6197 Builder bld(ctx->program, ctx->block);
6198 unsigned num_components = instr->num_components;
6199
6200 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6201 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6202 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
6203
6204 unsigned access = nir_intrinsic_access(instr);
6205 bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6206 unsigned size = instr->dest.ssa.bit_size / 8;
6207
6208 uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[0].ssa, access);
6209 /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
6210 * TODO: this optimization is disabled for now because we still need to ensure correct ordering
6211 */
6212 bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_store : has_vmem_store));
6213 allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
6214
6215 load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6216 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
6217 }
6218
6219 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
6220 {
6221 Builder bld(ctx->program, ctx->block);
6222 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6223 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6224 unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6225 Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6226
6227 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6228 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
6229
6230 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6231 uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
6232 /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
6233 * TODO: this optimization is disabled for now because we still need to ensure correct ordering
6234 */
6235 bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_loadstore : has_vmem_loadstore));
6236
6237 bool smem = !nir_src_is_divergent(instr->src[2]) &&
6238 ctx->options->chip_class >= GFX8 &&
6239 (elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) &&
6240 allow_smem;
6241 if (smem)
6242 offset = bld.as_uniform(offset);
6243 bool smem_nonfs = smem && ctx->stage != fragment_fs;
6244
6245 unsigned write_count = 0;
6246 Temp write_datas[32];
6247 unsigned offsets[32];
6248 split_buffer_store(ctx, instr, smem, smem_nonfs ? RegType::sgpr : (smem ? data.type() : RegType::vgpr),
6249 data, writemask, 16, &write_count, write_datas, offsets);
6250
6251 for (unsigned i = 0; i < write_count; i++) {
6252 aco_opcode op = get_buffer_store_op(smem, write_datas[i].bytes());
6253 if (smem && ctx->stage == fragment_fs)
6254 op = aco_opcode::p_fs_buffer_store_smem;
6255
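      /* scalar stores in fragment shaders must still respect EXEC/helper
       * invocations, so they are emitted as p_fs_buffer_store_smem, the block is
       * marked for lowering and the program is flagged needs_exact */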
6256 if (smem) {
6257 aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
6258 store->operands[0] = Operand(rsrc);
6259 if (offsets[i]) {
6260 Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
6261 offset, Operand(offsets[i]));
6262 store->operands[1] = Operand(off);
6263 } else {
6264 store->operands[1] = Operand(offset);
6265 }
6266 if (op != aco_opcode::p_fs_buffer_store_smem)
6267 store->operands[1].setFixed(m0);
6268 store->operands[2] = Operand(write_datas[i]);
6269 store->glc = glc;
6270 store->dlc = false;
6271 store->disable_wqm = true;
6272 store->barrier = barrier_buffer;
6273 ctx->block->instructions.emplace_back(std::move(store));
6274 ctx->program->wb_smem_l1_on_end = true;
6275 if (op == aco_opcode::p_fs_buffer_store_smem) {
6276 ctx->block->kind |= block_kind_needs_lowering;
6277 ctx->program->needs_exact = true;
6278 }
6279 } else {
6280 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6281 store->operands[0] = Operand(rsrc);
6282 store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6283 store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
6284 store->operands[3] = Operand(write_datas[i]);
6285 store->offset = offsets[i];
6286 store->offen = (offset.type() == RegType::vgpr);
6287 store->glc = glc;
6288 store->dlc = false;
6289 store->disable_wqm = true;
6290 store->barrier = barrier_buffer;
6291 ctx->program->needs_exact = true;
6292 ctx->block->instructions.emplace_back(std::move(store));
6293 }
6294 }
6295 }
6296
6297 void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
6298 {
6299 /* return the previous value if dest is ever used */
6300 bool return_previous = false;
6301 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
6302 return_previous = true;
6303 break;
6304 }
6305 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
6306 return_previous = true;
6307 break;
6308 }
6309
6310 Builder bld(ctx->program, ctx->block);
6311 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6312
6313 if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
6314 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6315 get_ssa_temp(ctx, instr->src[3].ssa), data);
6316
6317 Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6318 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6319 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
6320
6321 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6322
6323 aco_opcode op32, op64;
6324 switch (instr->intrinsic) {
6325 case nir_intrinsic_ssbo_atomic_add:
6326 op32 = aco_opcode::buffer_atomic_add;
6327 op64 = aco_opcode::buffer_atomic_add_x2;
6328 break;
6329 case nir_intrinsic_ssbo_atomic_imin:
6330 op32 = aco_opcode::buffer_atomic_smin;
6331 op64 = aco_opcode::buffer_atomic_smin_x2;
6332 break;
6333 case nir_intrinsic_ssbo_atomic_umin:
6334 op32 = aco_opcode::buffer_atomic_umin;
6335 op64 = aco_opcode::buffer_atomic_umin_x2;
6336 break;
6337 case nir_intrinsic_ssbo_atomic_imax:
6338 op32 = aco_opcode::buffer_atomic_smax;
6339 op64 = aco_opcode::buffer_atomic_smax_x2;
6340 break;
6341 case nir_intrinsic_ssbo_atomic_umax:
6342 op32 = aco_opcode::buffer_atomic_umax;
6343 op64 = aco_opcode::buffer_atomic_umax_x2;
6344 break;
6345 case nir_intrinsic_ssbo_atomic_and:
6346 op32 = aco_opcode::buffer_atomic_and;
6347 op64 = aco_opcode::buffer_atomic_and_x2;
6348 break;
6349 case nir_intrinsic_ssbo_atomic_or:
6350 op32 = aco_opcode::buffer_atomic_or;
6351 op64 = aco_opcode::buffer_atomic_or_x2;
6352 break;
6353 case nir_intrinsic_ssbo_atomic_xor:
6354 op32 = aco_opcode::buffer_atomic_xor;
6355 op64 = aco_opcode::buffer_atomic_xor_x2;
6356 break;
6357 case nir_intrinsic_ssbo_atomic_exchange:
6358 op32 = aco_opcode::buffer_atomic_swap;
6359 op64 = aco_opcode::buffer_atomic_swap_x2;
6360 break;
6361 case nir_intrinsic_ssbo_atomic_comp_swap:
6362 op32 = aco_opcode::buffer_atomic_cmpswap;
6363 op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6364 break;
6365 default:
6366 unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
6367 }
6368 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6369 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6370 mubuf->operands[0] = Operand(rsrc);
6371 mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6372 mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
6373 mubuf->operands[3] = Operand(data);
6374 if (return_previous)
6375 mubuf->definitions[0] = Definition(dst);
6376 mubuf->offset = 0;
6377 mubuf->offen = (offset.type() == RegType::vgpr);
6378 mubuf->glc = return_previous;
6379 mubuf->dlc = false; /* Not needed for atomics */
6380 mubuf->disable_wqm = true;
6381 mubuf->barrier = barrier_buffer;
6382 ctx->program->needs_exact = true;
6383 ctx->block->instructions.emplace_back(std::move(mubuf));
6384 }
6385
6386 void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
6387
6388 Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6389 Builder bld(ctx->program, ctx->block);
6390 Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
6391 get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
6392 }
6393
6394 void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
6395 {
6396 Builder bld(ctx->program, ctx->block);
6397 unsigned num_components = instr->num_components;
6398 unsigned component_size = instr->dest.ssa.bit_size / 8;
6399
6400 LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)),
6401 get_ssa_temp(ctx, &instr->dest.ssa),
6402 num_components, component_size};
6403 info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
6404 info.align_mul = nir_intrinsic_align_mul(instr);
6405 info.align_offset = nir_intrinsic_align_offset(instr);
6406 info.barrier = barrier_buffer;
6407 info.can_reorder = false;
6408 /* VMEM stores don't update the SMEM cache and it's difficult to prove that
6409 * it's safe to use SMEM */
6410 bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
6411 if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) || !can_use_smem) {
6412 emit_global_load(ctx, bld, &info);
6413 } else {
6414 info.offset = Operand(bld.as_uniform(info.offset));
6415 emit_smem_load(ctx, bld, &info);
6416 }
6417 }
6418
6419 void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
6420 {
6421 Builder bld(ctx->program, ctx->block);
6422 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6423 unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6424
6425 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6426 Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
6427 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6428
6429 if (ctx->options->chip_class >= GFX7)
6430 addr = as_vgpr(ctx, addr);
6431
6432 unsigned write_count = 0;
6433 Temp write_datas[32];
6434 unsigned offsets[32];
6435 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
6436 16, &write_count, write_datas, offsets);
6437
6438 for (unsigned i = 0; i < write_count; i++) {
6439 if (ctx->options->chip_class >= GFX7) {
6440 unsigned offset = offsets[i];
6441 Temp store_addr = addr;
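         /* FLAT stores before GFX9 have no immediate offset field, so fold the
          * offset into the 64-bit address with an add + carry */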
6442 if (offset > 0 && ctx->options->chip_class < GFX9) {
6443 Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
6444 Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
6445 Temp carry = bld.tmp(bld.lm);
6446 bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
6447
6448 bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
6449 Operand(offset), addr0);
6450 bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
6451 Operand(0u), addr1,
6452 carry).def(1).setHint(vcc);
6453
6454 store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
6455
6456 offset = 0;
6457 }
6458
6459 bool global = ctx->options->chip_class >= GFX9;
6460 aco_opcode op;
6461 switch (write_datas[i].bytes()) {
6462 case 1:
6463 op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte;
6464 break;
6465 case 2:
6466 op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short;
6467 break;
6468 case 4:
6469 op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
6470 break;
6471 case 8:
6472 op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6473 break;
6474 case 12:
6475 op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6476 break;
6477 case 16:
6478 op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6479 break;
6480 default:
6481 unreachable("store_global not implemented for this size.");
6482 }
6483
6484 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6485 flat->operands[0] = Operand(store_addr);
6486 flat->operands[1] = Operand(s1);
6487 flat->operands[2] = Operand(write_datas[i]);
6488 flat->glc = glc;
6489 flat->dlc = false;
6490 flat->offset = offset;
6491 flat->disable_wqm = true;
6492 flat->barrier = barrier_buffer;
6493 ctx->program->needs_exact = true;
6494 ctx->block->instructions.emplace_back(std::move(flat));
6495 } else {
6496 assert(ctx->options->chip_class == GFX6);
6497
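         /* GFX6 has no FLAT instructions: global stores are emitted as MUBUF
          * with 64-bit (addr64) addressing */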
6498 aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
6499
6500 Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6501
6502 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6503 mubuf->operands[0] = Operand(rsrc);
6504 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6505 mubuf->operands[2] = Operand(0u);
6506 mubuf->operands[3] = Operand(write_datas[i]);
6507 mubuf->glc = glc;
6508 mubuf->dlc = false;
6509 mubuf->offset = offsets[i];
6510 mubuf->addr64 = addr.type() == RegType::vgpr;
6511 mubuf->disable_wqm = true;
6512 mubuf->barrier = barrier_buffer;
6513 ctx->program->needs_exact = true;
6514 ctx->block->instructions.emplace_back(std::move(mubuf));
6515 }
6516 }
6517 }
6518
6519 void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
6520 {
6521 /* return the previous value if dest is ever used */
6522 bool return_previous = false;
6523 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
6524 return_previous = true;
6525 break;
6526 }
6527 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
6528 return_previous = true;
6529 break;
6530 }
6531
6532 Builder bld(ctx->program, ctx->block);
6533 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
6534 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6535
6536 if (ctx->options->chip_class >= GFX7)
6537 addr = as_vgpr(ctx, addr);
6538
6539 if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
6540 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6541 get_ssa_temp(ctx, instr->src[2].ssa), data);
6542
6543 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6544
6545 aco_opcode op32, op64;
6546
6547 if (ctx->options->chip_class >= GFX7) {
6548 bool global = ctx->options->chip_class >= GFX9;
6549 switch (instr->intrinsic) {
6550 case nir_intrinsic_global_atomic_add:
6551 op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6552 op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6553 break;
6554 case nir_intrinsic_global_atomic_imin:
6555 op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6556 op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6557 break;
6558 case nir_intrinsic_global_atomic_umin:
6559 op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6560 op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6561 break;
6562 case nir_intrinsic_global_atomic_imax:
6563 op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6564 op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6565 break;
6566 case nir_intrinsic_global_atomic_umax:
6567 op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6568 op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6569 break;
6570 case nir_intrinsic_global_atomic_and:
6571 op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6572 op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6573 break;
6574 case nir_intrinsic_global_atomic_or:
6575 op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6576 op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6577 break;
6578 case nir_intrinsic_global_atomic_xor:
6579 op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6580 op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6581 break;
6582 case nir_intrinsic_global_atomic_exchange:
6583 op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6584 op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6585 break;
6586 case nir_intrinsic_global_atomic_comp_swap:
6587 op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6588 op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6589 break;
6590 default:
6591 unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
6592 }
6593
6594 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6595 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
6596 flat->operands[0] = Operand(addr);
6597 flat->operands[1] = Operand(s1);
6598 flat->operands[2] = Operand(data);
6599 if (return_previous)
6600 flat->definitions[0] = Definition(dst);
6601 flat->glc = return_previous;
6602 flat->dlc = false; /* Not needed for atomics */
6603 flat->offset = 0;
6604 flat->disable_wqm = true;
6605 flat->barrier = barrier_buffer;
6606 ctx->program->needs_exact = true;
6607 ctx->block->instructions.emplace_back(std::move(flat));
6608 } else {
6609 assert(ctx->options->chip_class == GFX6);
6610
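      /* same GFX6 fallback as for stores: no FLAT instructions, so use MUBUF
       * buffer atomics with addr64 addressing */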
6611 switch (instr->intrinsic) {
6612 case nir_intrinsic_global_atomic_add:
6613 op32 = aco_opcode::buffer_atomic_add;
6614 op64 = aco_opcode::buffer_atomic_add_x2;
6615 break;
6616 case nir_intrinsic_global_atomic_imin:
6617 op32 = aco_opcode::buffer_atomic_smin;
6618 op64 = aco_opcode::buffer_atomic_smin_x2;
6619 break;
6620 case nir_intrinsic_global_atomic_umin:
6621 op32 = aco_opcode::buffer_atomic_umin;
6622 op64 = aco_opcode::buffer_atomic_umin_x2;
6623 break;
6624 case nir_intrinsic_global_atomic_imax:
6625 op32 = aco_opcode::buffer_atomic_smax;
6626 op64 = aco_opcode::buffer_atomic_smax_x2;
6627 break;
6628 case nir_intrinsic_global_atomic_umax:
6629 op32 = aco_opcode::buffer_atomic_umax;
6630 op64 = aco_opcode::buffer_atomic_umax_x2;
6631 break;
6632 case nir_intrinsic_global_atomic_and:
6633 op32 = aco_opcode::buffer_atomic_and;
6634 op64 = aco_opcode::buffer_atomic_and_x2;
6635 break;
6636 case nir_intrinsic_global_atomic_or:
6637 op32 = aco_opcode::buffer_atomic_or;
6638 op64 = aco_opcode::buffer_atomic_or_x2;
6639 break;
6640 case nir_intrinsic_global_atomic_xor:
6641 op32 = aco_opcode::buffer_atomic_xor;
6642 op64 = aco_opcode::buffer_atomic_xor_x2;
6643 break;
6644 case nir_intrinsic_global_atomic_exchange:
6645 op32 = aco_opcode::buffer_atomic_swap;
6646 op64 = aco_opcode::buffer_atomic_swap_x2;
6647 break;
6648 case nir_intrinsic_global_atomic_comp_swap:
6649 op32 = aco_opcode::buffer_atomic_cmpswap;
6650 op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6651 break;
6652 default:
6653 unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
6654 }
6655
6656 Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6657
6658 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6659
6660 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6661 mubuf->operands[0] = Operand(rsrc);
6662 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6663 mubuf->operands[2] = Operand(0u);
6664 mubuf->operands[3] = Operand(data);
6665 if (return_previous)
6666 mubuf->definitions[0] = Definition(dst);
6667 mubuf->glc = return_previous;
6668 mubuf->dlc = false;
6669 mubuf->offset = 0;
6670 mubuf->addr64 = addr.type() == RegType::vgpr;
6671 mubuf->disable_wqm = true;
6672 mubuf->barrier = barrier_buffer;
6673 ctx->program->needs_exact = true;
6674 ctx->block->instructions.emplace_back(std::move(mubuf));
6675 }
6676 }
6677
6678 void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
6679 Builder bld(ctx->program, ctx->block);
6680 switch(instr->intrinsic) {
6681 case nir_intrinsic_group_memory_barrier:
6682 case nir_intrinsic_memory_barrier:
6683 bld.barrier(aco_opcode::p_memory_barrier_common);
6684 break;
6685 case nir_intrinsic_memory_barrier_buffer:
6686 bld.barrier(aco_opcode::p_memory_barrier_buffer);
6687 break;
6688 case nir_intrinsic_memory_barrier_image:
6689 bld.barrier(aco_opcode::p_memory_barrier_image);
6690 break;
6691 case nir_intrinsic_memory_barrier_tcs_patch:
6692 case nir_intrinsic_memory_barrier_shared:
6693 bld.barrier(aco_opcode::p_memory_barrier_shared);
6694 break;
6695 default:
6696 unreachable("Unimplemented memory barrier intrinsic");
6697 break;
6698 }
6699 }
6700
6701 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
6702 {
6703 // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
6704 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6705 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6706 Builder bld(ctx->program, ctx->block);
6707
6708 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
6709 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
6710 load_lds(ctx, elem_size_bytes, dst, address, nir_intrinsic_base(instr), align);
6711 }
6712
6713 void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
6714 {
6715 unsigned writemask = nir_intrinsic_write_mask(instr);
6716 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6717 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6718 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6719
6720 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
6721 store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
6722 }
6723
6724 void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
6725 {
6726 unsigned offset = nir_intrinsic_base(instr);
6727 Builder bld(ctx->program, ctx->block);
6728 Operand m = load_lds_size_m0(bld);
6729 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6730 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6731
6732 unsigned num_operands = 3;
6733 aco_opcode op32, op64, op32_rtn, op64_rtn;
6734 switch(instr->intrinsic) {
6735 case nir_intrinsic_shared_atomic_add:
6736 op32 = aco_opcode::ds_add_u32;
6737 op64 = aco_opcode::ds_add_u64;
6738 op32_rtn = aco_opcode::ds_add_rtn_u32;
6739 op64_rtn = aco_opcode::ds_add_rtn_u64;
6740 break;
6741 case nir_intrinsic_shared_atomic_imin:
6742 op32 = aco_opcode::ds_min_i32;
6743 op64 = aco_opcode::ds_min_i64;
6744 op32_rtn = aco_opcode::ds_min_rtn_i32;
6745 op64_rtn = aco_opcode::ds_min_rtn_i64;
6746 break;
6747 case nir_intrinsic_shared_atomic_umin:
6748 op32 = aco_opcode::ds_min_u32;
6749 op64 = aco_opcode::ds_min_u64;
6750 op32_rtn = aco_opcode::ds_min_rtn_u32;
6751 op64_rtn = aco_opcode::ds_min_rtn_u64;
6752 break;
6753 case nir_intrinsic_shared_atomic_imax:
6754 op32 = aco_opcode::ds_max_i32;
6755 op64 = aco_opcode::ds_max_i64;
6756 op32_rtn = aco_opcode::ds_max_rtn_i32;
6757 op64_rtn = aco_opcode::ds_max_rtn_i64;
6758 break;
6759 case nir_intrinsic_shared_atomic_umax:
6760 op32 = aco_opcode::ds_max_u32;
6761 op64 = aco_opcode::ds_max_u64;
6762 op32_rtn = aco_opcode::ds_max_rtn_u32;
6763 op64_rtn = aco_opcode::ds_max_rtn_u64;
6764 break;
6765 case nir_intrinsic_shared_atomic_and:
6766 op32 = aco_opcode::ds_and_b32;
6767 op64 = aco_opcode::ds_and_b64;
6768 op32_rtn = aco_opcode::ds_and_rtn_b32;
6769 op64_rtn = aco_opcode::ds_and_rtn_b64;
6770 break;
6771 case nir_intrinsic_shared_atomic_or:
6772 op32 = aco_opcode::ds_or_b32;
6773 op64 = aco_opcode::ds_or_b64;
6774 op32_rtn = aco_opcode::ds_or_rtn_b32;
6775 op64_rtn = aco_opcode::ds_or_rtn_b64;
6776 break;
6777 case nir_intrinsic_shared_atomic_xor:
6778 op32 = aco_opcode::ds_xor_b32;
6779 op64 = aco_opcode::ds_xor_b64;
6780 op32_rtn = aco_opcode::ds_xor_rtn_b32;
6781 op64_rtn = aco_opcode::ds_xor_rtn_b64;
6782 break;
6783 case nir_intrinsic_shared_atomic_exchange:
6784 op32 = aco_opcode::ds_write_b32;
6785 op64 = aco_opcode::ds_write_b64;
6786 op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
6787 op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
6788 break;
6789 case nir_intrinsic_shared_atomic_comp_swap:
6790 op32 = aco_opcode::ds_cmpst_b32;
6791 op64 = aco_opcode::ds_cmpst_b64;
6792 op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
6793 op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
6794 num_operands = 4;
6795 break;
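   /* LDS float add only exists as a 32-bit operation (ds_add_f32 / ds_add_rtn_f32);
    * the 64-bit opcodes are left as num_opcodes sentinels and must never be selected */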
6796 case nir_intrinsic_shared_atomic_fadd:
6797 op32 = aco_opcode::ds_add_f32;
6798 op32_rtn = aco_opcode::ds_add_rtn_f32;
6799 op64 = aco_opcode::num_opcodes;
6800 op64_rtn = aco_opcode::num_opcodes;
6801 break;
6802 default:
6803 unreachable("Unhandled shared atomic intrinsic");
6804 }
6805
6806 /* return the previous value if dest is ever used */
6807 bool return_previous = false;
6808 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
6809 return_previous = true;
6810 break;
6811 }
6812 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
6813 return_previous = true;
6814 break;
6815 }
6816
6817 aco_opcode op;
6818 if (data.size() == 1) {
6819 assert(instr->dest.ssa.bit_size == 32);
6820 op = return_previous ? op32_rtn : op32;
6821 } else {
6822 assert(instr->dest.ssa.bit_size == 64);
6823 op = return_previous ? op64_rtn : op64;
6824 }
6825
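   /* Note: DS instructions only have a 16-bit immediate offset (offset1:offset0
    * combined for single-address ops), so a larger NIR base offset has to be
    * folded into the address VGPR first. */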
6826 if (offset > 65535) {
6827 address = bld.vadd32(bld.def(v1), Operand(offset), address);
6828 offset = 0;
6829 }
6830
6831 aco_ptr<DS_instruction> ds;
6832 ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
6833 ds->operands[0] = Operand(address);
6834 ds->operands[1] = Operand(data);
6835 if (num_operands == 4)
6836 ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
6837 ds->operands[num_operands - 1] = m;
6838 ds->offset0 = offset;
6839 if (return_previous)
6840 ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
6841 ctx->block->instructions.emplace_back(std::move(ds));
6842 }
6843
6844 Temp get_scratch_resource(isel_context *ctx)
6845 {
6846 Builder bld(ctx->program, ctx->block);
6847 Temp scratch_addr = ctx->program->private_segment_buffer;
6848 if (ctx->stage != compute_cs)
6849 scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
6850
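   /* Rough sketch of what this descriptor sets up: ADD_TID_ENABLE makes the
    * hardware add the thread id to the buffer index, and INDEX_STRIDE selects
    * the swizzle stride (2 = 32 elements, 3 = 64 elements), i.e. one scratch
    * slot per lane of the wave. */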
6851 uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
6852                              S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
6853
6854 if (ctx->program->chip_class >= GFX10) {
6855 rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
6856 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
6857 S_008F0C_RESOURCE_LEVEL(1);
6858 } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
6859 rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6860 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
6861 }
6862
6863    /* Older generations need an element size of 16 bytes; the element size field was removed in GFX9. */
6864 if (ctx->program->chip_class <= GFX8)
6865 rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
6866
6867 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
6868 }
6869
6870 void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
6871 Builder bld(ctx->program, ctx->block);
6872 Temp rsrc = get_scratch_resource(ctx);
6873 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6874 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6875
6876 LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components,
6877 instr->dest.ssa.bit_size / 8u, rsrc};
6878 info.align_mul = nir_intrinsic_align_mul(instr);
6879 info.align_offset = nir_intrinsic_align_offset(instr);
6880 info.swizzle_component_size = 16;
6881 info.can_reorder = false;
6882 info.soffset = ctx->program->scratch_offset;
6883 emit_mubuf_load(ctx, bld, &info);
6884 }
6885
6886 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
6887 Builder bld(ctx->program, ctx->block);
6888 Temp rsrc = get_scratch_resource(ctx);
6889 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6890 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6891
6892 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6893 unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6894
6895 unsigned write_count = 0;
6896 Temp write_datas[32];
6897 unsigned offsets[32];
6898 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
6899 16, &write_count, write_datas, offsets);
6900
6901 for (unsigned i = 0; i < write_count; i++) {
6902 aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
6903 bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
6904 }
6905 }
6906
6907 void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
6908 uint8_t log2_ps_iter_samples;
6909 if (ctx->program->info->ps.force_persample) {
6910 log2_ps_iter_samples =
6911 util_logbase2(ctx->options->key.fs.num_samples);
6912 } else {
6913 log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
6914 }
6915
6916 /* The bit pattern matches that used by fixed function fragment
6917 * processing. */
6918 static const unsigned ps_iter_masks[] = {
6919 0xffff, /* not used */
6920 0x5555,
6921 0x1111,
6922 0x0101,
6923 0x0001,
6924 };
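   /* Worked example: with log2_ps_iter_samples == 2 the mask 0x1111 has one
    * bit set per group of four samples; shifting it left by sample_id and
    * ANDing with sample_coverage below keeps only the coverage bits that
    * belong to this per-sample invocation. */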
6925 assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
6926
6927 Builder bld(ctx->program, ctx->block);
6928
6929 Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
6930 get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
6931 Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
6932 Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
6933 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6934 bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
6935 }
6936
6937 void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr) {
6938 Builder bld(ctx->program, ctx->block);
6939
6940 unsigned stream = nir_intrinsic_stream_id(instr);
6941 Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6942 next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
6943 nir_const_value *next_vertex_cv = nir_src_as_const_value(instr->src[0]);
6944
6945 /* get GSVS ring */
6946 Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_GSVS_GS * 16u));
6947
6948 unsigned num_components =
6949 ctx->program->info->gs.num_stream_output_components[stream];
6950 assert(num_components);
6951
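   /* GSVS ring layout (as used below): every output component of this stream
    * occupies one dword per emitted vertex, so a stream takes
    * 4 * num_components * vertices_out bytes per lane, and the rings of the
    * lower-numbered streams (scaled by the wave size) are skipped first. */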
6952 unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
6953 unsigned stream_offset = 0;
6954 for (unsigned i = 0; i < stream; i++) {
6955 unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * ctx->shader->info.gs.vertices_out;
6956 stream_offset += prev_stride * ctx->program->wave_size;
6957 }
6958
6959 /* Limit on the stride field for <= GFX7. */
6960 assert(stride < (1 << 14));
6961
6962 Temp gsvs_dwords[4];
6963 for (unsigned i = 0; i < 4; i++)
6964 gsvs_dwords[i] = bld.tmp(s1);
6965 bld.pseudo(aco_opcode::p_split_vector,
6966 Definition(gsvs_dwords[0]),
6967 Definition(gsvs_dwords[1]),
6968 Definition(gsvs_dwords[2]),
6969 Definition(gsvs_dwords[3]),
6970 gsvs_ring);
6971
6972 if (stream_offset) {
6973 Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand(stream_offset));
6974
6975 Temp carry = bld.tmp(s1);
6976 gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), gsvs_dwords[0], stream_offset_tmp);
6977 gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(0u), bld.scc(carry));
6978 }
6979
6980 gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(S_008F04_STRIDE(stride)));
6981 gsvs_dwords[2] = bld.copy(bld.def(s1), Operand((uint32_t)ctx->program->wave_size));
6982
6983 gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
6984 gsvs_dwords[0], gsvs_dwords[1], gsvs_dwords[2], gsvs_dwords[3]);
6985
6986 unsigned offset = 0;
6987 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
6988 if (ctx->program->info->gs.output_streams[i] != stream)
6989 continue;
6990
6991 for (unsigned j = 0; j < 4; j++) {
6992 if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))
6993 continue;
6994
6995 if (ctx->outputs.mask[i] & (1 << j)) {
6996 Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
6997 unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
6998 if (const_offset >= 4096u) {
6999 if (vaddr_offset.isUndefined())
7000 vaddr_offset = bld.copy(bld.def(v1), Operand(const_offset / 4096u * 4096u));
7001 else
7002 vaddr_offset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), vaddr_offset);
7003 const_offset %= 4096u;
7004 }
7005
7006 aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
7007 mtbuf->operands[0] = Operand(gsvs_ring);
7008 mtbuf->operands[1] = vaddr_offset;
7009 mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->gs2vs_offset));
7010 mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);
7011 mtbuf->offen = !vaddr_offset.isUndefined();
7012 mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
7013 mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
7014 mtbuf->offset = const_offset;
7015 mtbuf->glc = true;
7016 mtbuf->slc = true;
7017 mtbuf->barrier = barrier_gs_data;
7018 mtbuf->can_reorder = true;
7019 bld.insert(std::move(mtbuf));
7020 }
7021
7022 offset += ctx->shader->info.gs.vertices_out;
7023 }
7024
7025       /* Outputs for the next vertex are undefined; keeping them around can
7026        * create invalid IR in the presence of control flow. */
7027 ctx->outputs.mask[i] = 0;
7028 }
7029
7030 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
7031 }
7032
7033 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
7034 {
7035 Builder bld(ctx->program, ctx->block);
7036
7037 if (cluster_size == 1) {
7038 return src;
7039    } else if (op == nir_op_iand && cluster_size == 4) {
7040 //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
7041 Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7042 return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
7043 bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
7044 } else if (op == nir_op_ior && cluster_size == 4) {
7045 //subgroupClusteredOr(val, 4) -> wqm(val & exec)
7046 return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
7047 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
7048 } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
7049 //subgroupAnd(val) -> (exec & ~val) == 0
7050 Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
7051 Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
7052 return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
7053 } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
7054 //subgroupOr(val) -> (val & exec) != 0
7055 Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp();
7056 return bool_to_vector_condition(ctx, tmp);
7057 } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
7058 //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
7059 Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7060 tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
7061 tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
7062 return bool_to_vector_condition(ctx, tmp);
7063 } else {
7064 //subgroupClustered{And,Or,Xor}(val, n) ->
7065 //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) ; just v_mbcnt_lo_u32_b32 on wave32
7066 //cluster_offset = ~(n - 1) & lane_id
7067 //cluster_mask = ((1 << n) - 1)
7068 //subgroupClusteredAnd():
7069 // return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
7070 //subgroupClusteredOr():
7071 // return ((val & exec) >> cluster_offset) & cluster_mask != 0
7072 //subgroupClusteredXor():
7073    // return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
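   //e.g. cluster_size == 8, lane_id == 13: cluster_offset = ~7 & 13 = 8 and
   //cluster_mask = 0xff, so the lane tests bits [15:8] of the ballot value.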
7074 Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
7075 Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
7076
7077 Temp tmp;
7078 if (op == nir_op_iand)
7079 tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7080 else
7081 tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7082
7083 uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
7084
7085 if (ctx->program->chip_class <= GFX7)
7086 tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
7087 else if (ctx->program->wave_size == 64)
7088 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
7089 else
7090 tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
7091 tmp = emit_extract_vector(ctx, tmp, 0, v1);
7092 if (cluster_mask != 0xffffffff)
7093 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
7094
7095 Definition cmp_def = Definition();
7096 if (op == nir_op_iand) {
7097 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(cluster_mask), tmp).def(0);
7098 } else if (op == nir_op_ior) {
7099 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
7100 } else if (op == nir_op_ixor) {
7101 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
7102 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
7103 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
7104 }
7105 cmp_def.setHint(vcc);
7106 return cmp_def.getTemp();
7107 }
7108 }
7109
7110 Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
7111 {
7112 Builder bld(ctx->program, ctx->block);
7113
7114 //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
7115 //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
7116 //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
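   //mbcnt() counts set bits in lanes strictly below the current one, so
   //subgroupExclusiveAnd is true iff no earlier active lane had val == 0,
   //while the Or/Xor variants count earlier active lanes with val == 1.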
7117 Temp tmp;
7118 if (op == nir_op_iand)
7119 tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7120 else
7121       tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7122
7123 Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
7124 Temp lo = lohi.def(0).getTemp();
7125 Temp hi = lohi.def(1).getTemp();
7126 Temp mbcnt = emit_mbcnt(ctx, bld.def(v1), Operand(lo), Operand(hi));
7127
7128 Definition cmp_def = Definition();
7129 if (op == nir_op_iand)
7130 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
7131 else if (op == nir_op_ior)
7132 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
7133 else if (op == nir_op_ixor)
7134 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u),
7135 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
7136 cmp_def.setHint(vcc);
7137 return cmp_def.getTemp();
7138 }
7139
7140 Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
7141 {
7142 Builder bld(ctx->program, ctx->block);
7143
7144 //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
7145 //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
7146 //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
7147 Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
7148 if (op == nir_op_iand)
7149 return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7150 else if (op == nir_op_ior)
7151 return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7152 else if (op == nir_op_ixor)
7153 return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7154
7155 assert(false);
7156 return Temp();
7157 }
7158
7159 void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
7160 {
7161 Builder bld(ctx->program, ctx->block);
7162 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7163 if (src.regClass().type() == RegType::vgpr) {
7164 bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7165 } else if (src.regClass() == s1) {
7166 bld.sop1(aco_opcode::s_mov_b32, dst, src);
7167 } else if (src.regClass() == s2) {
7168 bld.sop1(aco_opcode::s_mov_b64, dst, src);
7169 } else {
7170 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7171 nir_print_instr(&instr->instr, stderr);
7172 fprintf(stderr, "\n");
7173 }
7174 }
7175
7176 void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
7177 {
7178 Builder bld(ctx->program, ctx->block);
7179 Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center);
7180 Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1);
7181 Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1);
7182
7183 Temp ddx_1, ddx_2, ddy_1, ddy_2;
7184 uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
7185 uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
7186 uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
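   /* dpp_quad_perm(n, n, n, n) broadcasts lane n of each quad to all four
    * lanes, and the DPP control applies to the first source of the v_sub, so
    * ddx_k = p_k(lane 1) - p_k(lane 0) and ddy_k = p_k(lane 2) - p_k(lane 0)
    * within every pixel quad. */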
7187
7188 /* Build DD X/Y */
7189 if (ctx->program->chip_class >= GFX8) {
7190 Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
7191 ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
7192 ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
7193 Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
7194 ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
7195 ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
7196 } else {
7197 Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
7198 ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
7199 ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
7200 ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
7201 ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_1);
7202 Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
7203 ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
7204 ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_2);
7205 ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
7206 ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
7207 }
7208
7209 /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
7210 Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
7211 Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
7212 tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
7213 tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
7214 Temp wqm1 = bld.tmp(v1);
7215 emit_wqm(ctx, tmp1, wqm1, true);
7216 Temp wqm2 = bld.tmp(v1);
7217 emit_wqm(ctx, tmp2, wqm2, true);
7218 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
7219 return;
7220 }
7221
7222 void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
7223 {
7224 Builder bld(ctx->program, ctx->block);
7225 switch(instr->intrinsic) {
7226 case nir_intrinsic_load_barycentric_sample:
7227 case nir_intrinsic_load_barycentric_pixel:
7228 case nir_intrinsic_load_barycentric_centroid: {
7229 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
7230 Temp bary = Temp(0, s2);
7231 switch (mode) {
7232 case INTERP_MODE_SMOOTH:
7233 case INTERP_MODE_NONE:
7234 if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
7235 bary = get_arg(ctx, ctx->args->ac.persp_center);
7236 else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
7237 bary = ctx->persp_centroid;
7238 else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
7239 bary = get_arg(ctx, ctx->args->ac.persp_sample);
7240 break;
7241 case INTERP_MODE_NOPERSPECTIVE:
7242 if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
7243 bary = get_arg(ctx, ctx->args->ac.linear_center);
7244 else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
7245 bary = ctx->linear_centroid;
7246 else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
7247 bary = get_arg(ctx, ctx->args->ac.linear_sample);
7248 break;
7249 default:
7250 break;
7251 }
7252 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7253 Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7254 Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7255 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
7256 Operand(p1), Operand(p2));
7257 emit_split_vector(ctx, dst, 2);
7258 break;
7259 }
7260 case nir_intrinsic_load_barycentric_model: {
7261 Temp model = get_arg(ctx, ctx->args->ac.pull_model);
7262
7263 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7264 Temp p1 = emit_extract_vector(ctx, model, 0, v1);
7265 Temp p2 = emit_extract_vector(ctx, model, 1, v1);
7266 Temp p3 = emit_extract_vector(ctx, model, 2, v1);
7267 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
7268 Operand(p1), Operand(p2), Operand(p3));
7269 emit_split_vector(ctx, dst, 3);
7270 break;
7271 }
7272 case nir_intrinsic_load_barycentric_at_sample: {
7273 uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
7274 switch (ctx->options->key.fs.num_samples) {
7275 case 2: sample_pos_offset += 1 << 3; break;
7276 case 4: sample_pos_offset += 3 << 3; break;
7277 case 8: sample_pos_offset += 7 << 3; break;
7278 default: break;
7279 }
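      /* The sample position ring stores the 1x, 2x, 4x and 8x tables back to
       * back with two floats (8 bytes) per sample, so the 2x/4x/8x tables
       * start 1, 3 and 7 entries in; that is where the "<< 3" scaling above
       * and below comes from. */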
7280 Temp sample_pos;
7281 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
7282 nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
7283 Temp private_segment_buffer = ctx->program->private_segment_buffer;
7284 //TODO: bounds checking?
7285 if (addr.type() == RegType::sgpr) {
7286 Operand offset;
7287 if (const_addr) {
7288 sample_pos_offset += const_addr->u32 << 3;
7289 offset = Operand(sample_pos_offset);
7290 } else if (ctx->options->chip_class >= GFX9) {
7291 offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
7292 } else {
7293 offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
7294                offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(sample_pos_offset));
7295 }
7296
7297 Operand off = bld.copy(bld.def(s1), Operand(offset));
7298 sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off);
7299
7300 } else if (ctx->options->chip_class >= GFX9) {
7301 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
7302 sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, private_segment_buffer, sample_pos_offset);
7303 } else if (ctx->options->chip_class >= GFX7) {
7304 /* addr += private_segment_buffer + sample_pos_offset */
7305 Temp tmp0 = bld.tmp(s1);
7306 Temp tmp1 = bld.tmp(s1);
7307 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), private_segment_buffer);
7308 Definition scc_tmp = bld.def(s1, scc);
7309 tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
7310 tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), bld.scc(scc_tmp.getTemp()));
7311 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
7312 Temp pck0 = bld.tmp(v1);
7313 Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
7314 tmp1 = as_vgpr(ctx, tmp1);
7315 Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry);
7316 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
7317
7318 /* sample_pos = flat_load_dwordx2 addr */
7319 sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
7320 } else {
7321 assert(ctx->options->chip_class == GFX6);
7322
7323 uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
7324 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
7325 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, Operand(0u), Operand(rsrc_conf));
7326
7327 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
7328 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand(0u));
7329
7330 sample_pos = bld.tmp(v2);
7331
7332 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)};
7333 load->definitions[0] = Definition(sample_pos);
7334 load->operands[0] = Operand(rsrc);
7335 load->operands[1] = Operand(addr);
7336 load->operands[2] = Operand(0u);
7337 load->offset = sample_pos_offset;
7338 load->offen = 0;
7339 load->addr64 = true;
7340 load->glc = false;
7341 load->dlc = false;
7342 load->disable_wqm = false;
7343 load->barrier = barrier_none;
7344 load->can_reorder = true;
7345 ctx->block->instructions.emplace_back(std::move(load));
7346 }
7347
7348 /* sample_pos -= 0.5 */
7349 Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
7350 Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
7351 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
7352 pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
7353 pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
7354
7355 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
7356 break;
7357 }
7358 case nir_intrinsic_load_barycentric_at_offset: {
7359 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
7360 RegClass rc = RegClass(offset.type(), 1);
7361 Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
7362 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
7363 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
7364 break;
7365 }
7366 case nir_intrinsic_load_front_face: {
7367 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7368 Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc);
7369 break;
7370 }
7371 case nir_intrinsic_load_view_index: {
7372 if (ctx->stage & (sw_vs | sw_gs | sw_tcs | sw_tes)) {
7373 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7374 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
7375 break;
7376 }
7377
7378 /* fallthrough */
7379 }
7380 case nir_intrinsic_load_layer_id: {
7381 unsigned idx = nir_intrinsic_base(instr);
7382 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7383 Operand(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0);
7384 break;
7385 }
7386 case nir_intrinsic_load_frag_coord: {
7387 emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
7388 break;
7389 }
7390 case nir_intrinsic_load_sample_pos: {
7391 Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
7392 Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
7393 bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7394 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
7395 posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
7396 break;
7397 }
7398 case nir_intrinsic_load_tess_coord:
7399 visit_load_tess_coord(ctx, instr);
7400 break;
7401 case nir_intrinsic_load_interpolated_input:
7402 visit_load_interpolated_input(ctx, instr);
7403 break;
7404 case nir_intrinsic_store_output:
7405 visit_store_output(ctx, instr);
7406 break;
7407 case nir_intrinsic_load_input:
7408 case nir_intrinsic_load_input_vertex:
7409 visit_load_input(ctx, instr);
7410 break;
7411 case nir_intrinsic_load_output:
7412 visit_load_output(ctx, instr);
7413 break;
7414 case nir_intrinsic_load_per_vertex_input:
7415 visit_load_per_vertex_input(ctx, instr);
7416 break;
7417 case nir_intrinsic_load_per_vertex_output:
7418 visit_load_per_vertex_output(ctx, instr);
7419 break;
7420 case nir_intrinsic_store_per_vertex_output:
7421 visit_store_per_vertex_output(ctx, instr);
7422 break;
7423 case nir_intrinsic_load_ubo:
7424 visit_load_ubo(ctx, instr);
7425 break;
7426 case nir_intrinsic_load_push_constant:
7427 visit_load_push_constant(ctx, instr);
7428 break;
7429 case nir_intrinsic_load_constant:
7430 visit_load_constant(ctx, instr);
7431 break;
7432 case nir_intrinsic_vulkan_resource_index:
7433 visit_load_resource(ctx, instr);
7434 break;
7435 case nir_intrinsic_discard:
7436 visit_discard(ctx, instr);
7437 break;
7438 case nir_intrinsic_discard_if:
7439 visit_discard_if(ctx, instr);
7440 break;
7441 case nir_intrinsic_load_shared:
7442 visit_load_shared(ctx, instr);
7443 break;
7444 case nir_intrinsic_store_shared:
7445 visit_store_shared(ctx, instr);
7446 break;
7447 case nir_intrinsic_shared_atomic_add:
7448 case nir_intrinsic_shared_atomic_imin:
7449 case nir_intrinsic_shared_atomic_umin:
7450 case nir_intrinsic_shared_atomic_imax:
7451 case nir_intrinsic_shared_atomic_umax:
7452 case nir_intrinsic_shared_atomic_and:
7453 case nir_intrinsic_shared_atomic_or:
7454 case nir_intrinsic_shared_atomic_xor:
7455 case nir_intrinsic_shared_atomic_exchange:
7456 case nir_intrinsic_shared_atomic_comp_swap:
7457 case nir_intrinsic_shared_atomic_fadd:
7458 visit_shared_atomic(ctx, instr);
7459 break;
7460 case nir_intrinsic_image_deref_load:
7461 visit_image_load(ctx, instr);
7462 break;
7463 case nir_intrinsic_image_deref_store:
7464 visit_image_store(ctx, instr);
7465 break;
7466 case nir_intrinsic_image_deref_atomic_add:
7467 case nir_intrinsic_image_deref_atomic_umin:
7468 case nir_intrinsic_image_deref_atomic_imin:
7469 case nir_intrinsic_image_deref_atomic_umax:
7470 case nir_intrinsic_image_deref_atomic_imax:
7471 case nir_intrinsic_image_deref_atomic_and:
7472 case nir_intrinsic_image_deref_atomic_or:
7473 case nir_intrinsic_image_deref_atomic_xor:
7474 case nir_intrinsic_image_deref_atomic_exchange:
7475 case nir_intrinsic_image_deref_atomic_comp_swap:
7476 visit_image_atomic(ctx, instr);
7477 break;
7478 case nir_intrinsic_image_deref_size:
7479 visit_image_size(ctx, instr);
7480 break;
7481 case nir_intrinsic_load_ssbo:
7482 visit_load_ssbo(ctx, instr);
7483 break;
7484 case nir_intrinsic_store_ssbo:
7485 visit_store_ssbo(ctx, instr);
7486 break;
7487 case nir_intrinsic_load_global:
7488 visit_load_global(ctx, instr);
7489 break;
7490 case nir_intrinsic_store_global:
7491 visit_store_global(ctx, instr);
7492 break;
7493 case nir_intrinsic_global_atomic_add:
7494 case nir_intrinsic_global_atomic_imin:
7495 case nir_intrinsic_global_atomic_umin:
7496 case nir_intrinsic_global_atomic_imax:
7497 case nir_intrinsic_global_atomic_umax:
7498 case nir_intrinsic_global_atomic_and:
7499 case nir_intrinsic_global_atomic_or:
7500 case nir_intrinsic_global_atomic_xor:
7501 case nir_intrinsic_global_atomic_exchange:
7502 case nir_intrinsic_global_atomic_comp_swap:
7503 visit_global_atomic(ctx, instr);
7504 break;
7505 case nir_intrinsic_ssbo_atomic_add:
7506 case nir_intrinsic_ssbo_atomic_imin:
7507 case nir_intrinsic_ssbo_atomic_umin:
7508 case nir_intrinsic_ssbo_atomic_imax:
7509 case nir_intrinsic_ssbo_atomic_umax:
7510 case nir_intrinsic_ssbo_atomic_and:
7511 case nir_intrinsic_ssbo_atomic_or:
7512 case nir_intrinsic_ssbo_atomic_xor:
7513 case nir_intrinsic_ssbo_atomic_exchange:
7514 case nir_intrinsic_ssbo_atomic_comp_swap:
7515 visit_atomic_ssbo(ctx, instr);
7516 break;
7517 case nir_intrinsic_load_scratch:
7518 visit_load_scratch(ctx, instr);
7519 break;
7520 case nir_intrinsic_store_scratch:
7521 visit_store_scratch(ctx, instr);
7522 break;
7523 case nir_intrinsic_get_buffer_size:
7524 visit_get_buffer_size(ctx, instr);
7525 break;
7526 case nir_intrinsic_control_barrier: {
7527 if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
7528 /* GFX6 only (thanks to a hw bug workaround):
7529           * The real barrier instruction isn't needed, because an entire patch
7530 * always fits into a single wave.
7531 */
7532 break;
7533 }
7534
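      /* When the whole workgroup fits into a single wave its invocations
       * already execute together, so the control barrier can be skipped. */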
7535 if (ctx->program->workgroup_size > ctx->program->wave_size)
7536 bld.sopp(aco_opcode::s_barrier);
7537
7538 break;
7539 }
7540 case nir_intrinsic_memory_barrier_tcs_patch:
7541 case nir_intrinsic_group_memory_barrier:
7542 case nir_intrinsic_memory_barrier:
7543 case nir_intrinsic_memory_barrier_buffer:
7544 case nir_intrinsic_memory_barrier_image:
7545 case nir_intrinsic_memory_barrier_shared:
7546 emit_memory_barrier(ctx, instr);
7547 break;
7548 case nir_intrinsic_load_num_work_groups: {
7549 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7550 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
7551 emit_split_vector(ctx, dst, 3);
7552 break;
7553 }
7554 case nir_intrinsic_load_local_invocation_id: {
7555 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7556 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
7557 emit_split_vector(ctx, dst, 3);
7558 break;
7559 }
7560 case nir_intrinsic_load_work_group_id: {
7561 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7562 struct ac_arg *args = ctx->args->ac.workgroup_ids;
7563 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
7564 args[0].used ? Operand(get_arg(ctx, args[0])) : Operand(0u),
7565 args[1].used ? Operand(get_arg(ctx, args[1])) : Operand(0u),
7566 args[2].used ? Operand(get_arg(ctx, args[2])) : Operand(0u));
7567 emit_split_vector(ctx, dst, 3);
7568 break;
7569 }
7570 case nir_intrinsic_load_local_invocation_index: {
7571 Temp id = emit_mbcnt(ctx, bld.def(v1));
7572
7573       /* The tg_size bits [6:11] contain the subgroup id;
7574        * multiply it by the wave size and then OR the thread id into it.
7575        */
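      /* e.g. in wave64, a subgroup id of 2 contributes 2 * 64 = 128, and
       * OR-ing in the in-wave id from mbcnt yields invocation indices
       * 128..191 for that wave. */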
7576 if (ctx->program->wave_size == 64) {
7577 /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just feed that to v_or */
7578 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u),
7579 get_arg(ctx, ctx->args->ac.tg_size));
7580 bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
7581 } else {
7582 /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */
7583 Temp tg_num = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
7584 get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
7585 bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, Operand(0x5u), id);
7586 }
7587 break;
7588 }
7589 case nir_intrinsic_load_subgroup_id: {
7590 if (ctx->stage == compute_cs) {
7591 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc),
7592 get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
7593 } else {
7594 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
7595 }
7596 break;
7597 }
7598 case nir_intrinsic_load_subgroup_invocation: {
7599 emit_mbcnt(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)));
7600 break;
7601 }
7602 case nir_intrinsic_load_num_subgroups: {
7603 if (ctx->stage == compute_cs)
7604 bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu),
7605 get_arg(ctx, ctx->args->ac.tg_size));
7606 else
7607 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
7608 break;
7609 }
7610 case nir_intrinsic_ballot: {
7611 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7612 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7613 Definition tmp = bld.def(dst.regClass());
7614 Definition lanemask_tmp = dst.size() == bld.lm.size() ? tmp : bld.def(src.regClass());
7615 if (instr->src[0].ssa->bit_size == 1) {
7616 assert(src.regClass() == bld.lm);
7617 bld.sop2(Builder::s_and, lanemask_tmp, bld.def(s1, scc), Operand(exec, bld.lm), src);
7618 } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
7619 bld.vopc(aco_opcode::v_cmp_lg_u32, lanemask_tmp, Operand(0u), src);
7620 } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
7621 bld.vopc(aco_opcode::v_cmp_lg_u64, lanemask_tmp, Operand(0u), src);
7622 } else {
7623 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7624 nir_print_instr(&instr->instr, stderr);
7625 fprintf(stderr, "\n");
7626 }
7627 if (dst.size() != bld.lm.size()) {
7628 /* Wave32 with ballot size set to 64 */
7629 bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lanemask_tmp.getTemp(), Operand(0u));
7630 }
7631 emit_wqm(ctx, tmp.getTemp(), dst);
7632 break;
7633 }
7634 case nir_intrinsic_shuffle:
7635 case nir_intrinsic_read_invocation: {
7636 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7637 if (!nir_src_is_divergent(instr->src[0])) {
7638 emit_uniform_subgroup(ctx, instr, src);
7639 } else {
7640 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
7641 if (instr->intrinsic == nir_intrinsic_read_invocation || !nir_src_is_divergent(instr->src[1]))
7642 tid = bld.as_uniform(tid);
7643 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7644 if (src.regClass() == v1b || src.regClass() == v2b) {
7645 Temp tmp = bld.tmp(v1);
7646 tmp = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), tmp);
7647 if (dst.type() == RegType::vgpr)
7648 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
7649 else
7650 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
7651 } else if (src.regClass() == v1) {
7652 emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst);
7653 } else if (src.regClass() == v2) {
7654 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7655 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7656 lo = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, lo));
7657 hi = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, hi));
7658 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7659 emit_split_vector(ctx, dst, 2);
7660 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
7661 assert(src.regClass() == bld.lm);
7662 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
7663 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
7664 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
7665 assert(src.regClass() == bld.lm);
7666 Temp tmp;
7667 if (ctx->program->chip_class <= GFX7)
7668 tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
7669 else if (ctx->program->wave_size == 64)
7670 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
7671 else
7672 tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
7673 tmp = emit_extract_vector(ctx, tmp, 0, v1);
7674 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
7675 emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst);
7676 } else {
7677 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7678 nir_print_instr(&instr->instr, stderr);
7679 fprintf(stderr, "\n");
7680 }
7681 }
7682 break;
7683 }
7684 case nir_intrinsic_load_sample_id: {
7685 bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7686 get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
7687 break;
7688 }
7689 case nir_intrinsic_load_sample_mask_in: {
7690 visit_load_sample_mask_in(ctx, instr);
7691 break;
7692 }
7693 case nir_intrinsic_read_first_invocation: {
7694 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7695 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7696 if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
7697 emit_wqm(ctx,
7698 bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
7699 dst);
7700 } else if (src.regClass() == v2) {
7701 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7702 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7703 lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
7704 hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
7705 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7706 emit_split_vector(ctx, dst, 2);
7707 } else if (instr->dest.ssa.bit_size == 1) {
7708 assert(src.regClass() == bld.lm);
7709 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
7710 bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
7711 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
7712 } else if (src.regClass() == s1) {
7713 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
7714 } else if (src.regClass() == s2) {
7715 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
7716 } else {
7717 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7718 nir_print_instr(&instr->instr, stderr);
7719 fprintf(stderr, "\n");
7720 }
7721 break;
7722 }
7723 case nir_intrinsic_vote_all: {
7724 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7725 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7726 assert(src.regClass() == bld.lm);
7727 assert(dst.regClass() == bld.lm);
7728
7729 Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
7730 Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
7731 bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
7732 break;
7733 }
7734 case nir_intrinsic_vote_any: {
7735 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7736 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7737 assert(src.regClass() == bld.lm);
7738 assert(dst.regClass() == bld.lm);
7739
7740 Temp tmp = bool_to_scalar_condition(ctx, src);
7741 bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
7742 break;
7743 }
7744 case nir_intrinsic_reduce:
7745 case nir_intrinsic_inclusive_scan:
7746 case nir_intrinsic_exclusive_scan: {
7747 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7748 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7749 nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
7750 unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
7751 nir_intrinsic_cluster_size(instr) : 0;
7752 cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
7753
7754 if (!nir_src_is_divergent(instr->src[0]) && (op == nir_op_ior || op == nir_op_iand)) {
7755 emit_uniform_subgroup(ctx, instr, src);
7756 } else if (instr->dest.ssa.bit_size == 1) {
7757 if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
7758 op = nir_op_iand;
7759 else if (op == nir_op_iadd)
7760 op = nir_op_ixor;
7761 else if (op == nir_op_umax || op == nir_op_imax)
7762 op = nir_op_ior;
7763 assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
7764
7765 switch (instr->intrinsic) {
7766 case nir_intrinsic_reduce:
7767 emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
7768 break;
7769 case nir_intrinsic_exclusive_scan:
7770 emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
7771 break;
7772 case nir_intrinsic_inclusive_scan:
7773 emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
7774 break;
7775 default:
7776 assert(false);
7777 }
7778 } else if (cluster_size == 1) {
7779 bld.copy(Definition(dst), src);
7780 } else {
7781 unsigned bit_size = instr->src[0].ssa->bit_size;
7782
7783 src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
7784
7785 ReduceOp reduce_op;
7786 switch (op) {
7787 #define CASEI(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; break;
7788 #define CASEF(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; break;
7789 CASEI(iadd)
7790 CASEI(imul)
7791 CASEI(imin)
7792 CASEI(umin)
7793 CASEI(imax)
7794 CASEI(umax)
7795 CASEI(iand)
7796 CASEI(ior)
7797 CASEI(ixor)
7798 CASEF(fadd)
7799 CASEF(fmul)
7800 CASEF(fmin)
7801 CASEF(fmax)
7802 default:
7803 unreachable("unknown reduction op");
7804 #undef CASEI
7805 #undef CASEF
7806 }
7807
7808 aco_opcode aco_op;
7809 switch (instr->intrinsic) {
7810 case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
7811 case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
7812 case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
7813 default:
7814 unreachable("unknown reduce intrinsic");
7815 }
7816
7817 aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
7818 reduce->operands[0] = Operand(src);
7819 // filled in by aco_reduce_assign.cpp, used internally as part of the
7820 // reduce sequence
7821 assert(dst.size() == 1 || dst.size() == 2);
7822 reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7823 reduce->operands[2] = Operand(v1.as_linear());
7824
7825 Temp tmp_dst = bld.tmp(dst.regClass());
7826 reduce->definitions[0] = Definition(tmp_dst);
7827 reduce->definitions[1] = bld.def(ctx->program->lane_mask); // used internally
7828 reduce->definitions[2] = Definition();
7829 reduce->definitions[3] = Definition(scc, s1);
7830 reduce->definitions[4] = Definition();
7831 reduce->reduce_op = reduce_op;
7832 reduce->cluster_size = cluster_size;
7833 ctx->block->instructions.emplace_back(std::move(reduce));
7834
7835 emit_wqm(ctx, tmp_dst, dst);
7836 }
7837 break;
7838 }
7839 case nir_intrinsic_quad_broadcast: {
7840 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7841 if (!nir_dest_is_divergent(instr->dest)) {
7842 emit_uniform_subgroup(ctx, instr, src);
7843 } else {
7844 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7845 unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
7846 uint32_t dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
7847
7848 if (instr->dest.ssa.bit_size == 1) {
7849 assert(src.regClass() == bld.lm);
7850 assert(dst.regClass() == bld.lm);
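            /* 0x11111111 has bit 0 of every nibble set, so shifting it by the
             * requested lane and replicating it into both halves of the mask
             * selects exactly that lane in every quad; s_wqm then spreads the
             * selected bit across each quad. */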
7851 uint32_t half_mask = 0x11111111u << lane;
7852 Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
7853 Temp tmp = bld.tmp(bld.lm);
7854 bld.sop1(Builder::s_wqm, Definition(tmp),
7855 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp,
7856 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))));
7857 emit_wqm(ctx, tmp, dst);
7858 } else if (instr->dest.ssa.bit_size == 8) {
7859 Temp tmp = bld.tmp(v1);
7860 if (ctx->program->chip_class >= GFX8)
7861 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
7862 else
7863 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp);
7864 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
7865 } else if (instr->dest.ssa.bit_size == 16) {
7866 Temp tmp = bld.tmp(v1);
7867 if (ctx->program->chip_class >= GFX8)
7868 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
7869 else
7870 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp);
7871 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
7872 } else if (instr->dest.ssa.bit_size == 32) {
7873 if (ctx->program->chip_class >= GFX8)
7874 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst);
7875 else
7876 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), dst);
7877 } else if (instr->dest.ssa.bit_size == 64) {
7878 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7879 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7880 if (ctx->program->chip_class >= GFX8) {
7881 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
7882 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
7883 } else {
7884 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl));
7885 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl));
7886 }
7887 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7888 emit_split_vector(ctx, dst, 2);
7889 } else {
7890 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7891 nir_print_instr(&instr->instr, stderr);
7892 fprintf(stderr, "\n");
7893 }
7894 }
7895 break;
7896 }
7897 case nir_intrinsic_quad_swap_horizontal:
7898 case nir_intrinsic_quad_swap_vertical:
7899 case nir_intrinsic_quad_swap_diagonal:
7900 case nir_intrinsic_quad_swizzle_amd: {
7901 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7902 if (!nir_dest_is_divergent(instr->dest)) {
7903 emit_uniform_subgroup(ctx, instr, src);
7904 break;
7905 }
7906 uint16_t dpp_ctrl = 0;
7907 switch (instr->intrinsic) {
7908 case nir_intrinsic_quad_swap_horizontal:
7909 dpp_ctrl = dpp_quad_perm(1, 0, 3, 2);
7910 break;
7911 case nir_intrinsic_quad_swap_vertical:
7912 dpp_ctrl = dpp_quad_perm(2, 3, 0, 1);
7913 break;
7914 case nir_intrinsic_quad_swap_diagonal:
7915 dpp_ctrl = dpp_quad_perm(3, 2, 1, 0);
7916 break;
7917 case nir_intrinsic_quad_swizzle_amd:
7918 dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
7919 break;
7920 default:
7921 break;
7922 }
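      /* Before GFX8 there is no DPP; ds_swizzle_b32 is used instead, and (as
       * far as the encoding goes) setting bit 15 of its offset selects the
       * quad-permute mode, which lets the dpp_quad_perm() value be reused. */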
7923 if (ctx->program->chip_class < GFX8)
7924 dpp_ctrl |= (1 << 15);
7925
7926 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7927 if (instr->dest.ssa.bit_size == 1) {
7928 assert(src.regClass() == bld.lm);
7929 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
7930 if (ctx->program->chip_class >= GFX8)
7931 src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
7932 else
7933 src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
7934 Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
7935 emit_wqm(ctx, tmp, dst);
7936 } else if (instr->dest.ssa.bit_size == 8) {
7937 Temp tmp = bld.tmp(v1);
7938 if (ctx->program->chip_class >= GFX8)
7939 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
7940 else
7941 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
7942 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
7943 } else if (instr->dest.ssa.bit_size == 16) {
7944 Temp tmp = bld.tmp(v1);
7945 if (ctx->program->chip_class >= GFX8)
7946 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
7947 else
7948 emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
7949 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
7950 } else if (instr->dest.ssa.bit_size == 32) {
7951 Temp tmp;
7952 if (ctx->program->chip_class >= GFX8)
7953 tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
7954 else
7955 tmp = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
7956 emit_wqm(ctx, tmp, dst);
7957 } else if (instr->dest.ssa.bit_size == 64) {
7958 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7959 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7960 if (ctx->program->chip_class >= GFX8) {
7961 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
7962 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
7963 } else {
7964 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, dpp_ctrl));
7965 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, dpp_ctrl));
7966 }
7967 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7968 emit_split_vector(ctx, dst, 2);
7969 } else {
7970 fprintf(stderr, "Unimplemented NIR instr bit size: ");
7971 nir_print_instr(&instr->instr, stderr);
7972 fprintf(stderr, "\n");
7973 }
7974 break;
7975 }
7976 case nir_intrinsic_masked_swizzle_amd: {
7977 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7978 if (!nir_dest_is_divergent(instr->dest)) {
7979 emit_uniform_subgroup(ctx, instr, src);
7980 break;
7981 }
7982 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7983 uint32_t mask = nir_intrinsic_swizzle_mask(instr);
7984 if (instr->dest.ssa.bit_size == 1) {
7985 assert(src.regClass() == bld.lm);
7986 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
7987 src = emit_masked_swizzle(ctx, bld, src, mask);
7988 Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
7989 emit_wqm(ctx, tmp, dst);
7990 } else if (dst.regClass() == v1b) {
7991 Temp tmp = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask));
7992 emit_extract_vector(ctx, tmp, 0, dst);
7993 } else if (dst.regClass() == v2b) {
7994 Temp tmp = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask));
7995 emit_extract_vector(ctx, tmp, 0, dst);
7996 } else if (dst.regClass() == v1) {
7997 emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask), dst);
7998 } else if (dst.regClass() == v2) {
7999 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
8000 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
8001 lo = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, lo, mask));
8002 hi = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, hi, mask));
8003 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8004 emit_split_vector(ctx, dst, 2);
8005 } else {
8006 fprintf(stderr, "Unimplemented NIR instr bit size: ");
8007 nir_print_instr(&instr->instr, stderr);
8008 fprintf(stderr, "\n");
8009 }
8010 break;
8011 }
8012 case nir_intrinsic_write_invocation_amd: {
8013 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
8014 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
8015 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
8016 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8017 if (dst.regClass() == v1) {
8018 /* src2 is ignored for writelane. RA assigns the same reg for dst */
8019 emit_wqm(ctx, bld.writelane(bld.def(v1), val, lane, src), dst);
8020 } else if (dst.regClass() == v2) {
8021 Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
8022 Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
8023 bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
8024 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
8025          Temp lo = emit_wqm(ctx, bld.writelane(bld.def(v1), val_lo, lane, src_lo));
8026 Temp hi = emit_wqm(ctx, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
8027 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8028 emit_split_vector(ctx, dst, 2);
8029 } else {
8030 fprintf(stderr, "Unimplemented NIR instr bit size: ");
8031 nir_print_instr(&instr->instr, stderr);
8032 fprintf(stderr, "\n");
8033 }
8034 break;
8035 }
8036 case nir_intrinsic_mbcnt_amd: {
8037 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8038 RegClass rc = RegClass(src.type(), 1);
8039 Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
8040 bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
8041 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8042 Temp wqm_tmp = emit_mbcnt(ctx, bld.def(v1), Operand(mask_lo), Operand(mask_hi));
8043 emit_wqm(ctx, wqm_tmp, dst);
8044 break;
8045 }
8046 case nir_intrinsic_load_helper_invocation: {
8047 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8048 bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
8049 ctx->block->kind |= block_kind_needs_lowering;
8050 ctx->program->needs_exact = true;
8051 break;
8052 }
8053 case nir_intrinsic_is_helper_invocation: {
8054 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8055 bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
8056 ctx->block->kind |= block_kind_needs_lowering;
8057 ctx->program->needs_exact = true;
8058 break;
8059 }
8060 case nir_intrinsic_demote:
8061 bld.pseudo(aco_opcode::p_demote_to_helper, Operand(-1u));
8062
8063 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8064 ctx->cf_info.exec_potentially_empty_discard = true;
8065 ctx->block->kind |= block_kind_uses_demote;
8066 ctx->program->needs_exact = true;
8067 break;
8068 case nir_intrinsic_demote_if: {
8069 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8070 assert(src.regClass() == bld.lm);
8071 Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8072 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8073
8074 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8075 ctx->cf_info.exec_potentially_empty_discard = true;
8076 ctx->block->kind |= block_kind_uses_demote;
8077 ctx->program->needs_exact = true;
8078 break;
8079 }
8080 case nir_intrinsic_first_invocation: {
8081 emit_wqm(ctx, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
8082 get_ssa_temp(ctx, &instr->dest.ssa));
8083 break;
8084 }
8085 case nir_intrinsic_shader_clock: {
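      /* Note (assumption): device scope presumably needs s_memrealtime, a
       * constant-rate counter that is consistent across the device, while
       * narrower scopes can use the cheaper shader-clock counter read by
       * s_memtime. */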
8086 aco_opcode opcode =
8087 nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ?
8088 aco_opcode::s_memrealtime : aco_opcode::s_memtime;
8089 bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
8090 emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
8091 break;
8092 }
8093 case nir_intrinsic_load_vertex_id_zero_base: {
8094 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8095 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
8096 break;
8097 }
8098 case nir_intrinsic_load_first_vertex: {
8099 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8100 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
8101 break;
8102 }
8103 case nir_intrinsic_load_base_instance: {
8104 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8105 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
8106 break;
8107 }
8108 case nir_intrinsic_load_instance_id: {
8109 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8110 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
8111 break;
8112 }
8113 case nir_intrinsic_load_draw_id: {
8114 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8115 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
8116 break;
8117 }
8118 case nir_intrinsic_load_invocation_id: {
8119 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8120
8121 if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
8122 if (ctx->options->chip_class >= GFX10)
8123 bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand(127u), get_arg(ctx, ctx->args->ac.gs_invocation_id));
8124 else
8125 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
8126 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
8127 bld.vop3(aco_opcode::v_bfe_u32, Definition(dst),
8128 get_arg(ctx, ctx->args->ac.tcs_rel_ids), Operand(8u), Operand(5u));
8129 } else {
8130 unreachable("Unsupported stage for load_invocation_id");
8131 }
8132
8133 break;
8134 }
8135 case nir_intrinsic_load_primitive_id: {
8136 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8137
8138 switch (ctx->shader->info.stage) {
8139 case MESA_SHADER_GEOMETRY:
8140 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
8141 break;
8142 case MESA_SHADER_TESS_CTRL:
8143 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id));
8144 break;
8145 case MESA_SHADER_TESS_EVAL:
8146 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id));
8147 break;
8148 default:
8149 unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
8150 }
8151
8152 break;
8153 }
8154 case nir_intrinsic_load_patch_vertices_in: {
8155 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL ||
8156 ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
8157
8158 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8159 bld.copy(Definition(dst), Operand(ctx->args->options->key.tcs.input_vertices));
8160 break;
8161 }
8162 case nir_intrinsic_emit_vertex_with_counter: {
8163 visit_emit_vertex_with_counter(ctx, instr);
8164 break;
8165 }
8166 case nir_intrinsic_end_primitive_with_counter: {
8167 unsigned stream = nir_intrinsic_stream_id(instr);
8168 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream));
8169 break;
8170 }
8171 case nir_intrinsic_set_vertex_count: {
8172 /* unused, the HW keeps track of this for us */
8173 break;
8174 }
8175 default:
8176 fprintf(stderr, "Unimplemented intrinsic instr: ");
8177 nir_print_instr(&instr->instr, stderr);
8178 fprintf(stderr, "\n");
8179 abort();
8180
8181 break;
8182 }
8183 }
8184
8185
8186 void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr,
8187 Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr,
8188 enum glsl_base_type *stype)
8189 {
8190 nir_deref_instr *texture_deref_instr = NULL;
8191 nir_deref_instr *sampler_deref_instr = NULL;
8192 int plane = -1;
8193
8194 for (unsigned i = 0; i < instr->num_srcs; i++) {
8195 switch (instr->src[i].src_type) {
8196 case nir_tex_src_texture_deref:
8197 texture_deref_instr = nir_src_as_deref(instr->src[i].src);
8198 break;
8199 case nir_tex_src_sampler_deref:
8200 sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
8201 break;
8202 case nir_tex_src_plane:
8203 plane = nir_src_as_int(instr->src[i].src);
8204 break;
8205 default:
8206 break;
8207 }
8208 }
8209
8210 *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
8211
8212 if (!sampler_deref_instr)
8213 sampler_deref_instr = texture_deref_instr;
8214
8215 if (plane >= 0) {
8216 assert(instr->op != nir_texop_txf_ms &&
8217 instr->op != nir_texop_samples_identical);
8218 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
8219 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
8220 } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
8221 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
8222 } else if (instr->op == nir_texop_fragment_mask_fetch) {
8223 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
8224 } else {
8225 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
8226 }
8227 if (samp_ptr) {
8228 *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false);
8229
8230 if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
8231 /* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */
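            /* (Presumably the same workaround as in ac_nir_to_llvm: ANDing
             * sampler dword0 with image dword7 is supposed to disable the
             * anisotropic-filtering settings for images where they would
             * misbehave on SI/CI; see the equivalent code there for details.) */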
8232 Builder bld(ctx->program, ctx->block);
8233
8234 /* to avoid unnecessary moves, we split and recombine sampler and image */
8235 Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1),
8236 bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
8237 Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
8238 bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]),
8239 Definition(img[2]), Definition(img[3]), Definition(img[4]),
8240 Definition(img[5]), Definition(img[6]), Definition(img[7]), *res_ptr);
8241 bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]),
8242 Definition(samp[2]), Definition(samp[3]), *samp_ptr);
8243
8244 samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]);
8245 *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
8246 img[0], img[1], img[2], img[3],
8247 img[4], img[5], img[6], img[7]);
8248 *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
8249 samp[0], samp[1], samp[2], samp[3]);
8250 }
8251 }
8252 if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
8253 instr->op == nir_texop_samples_identical))
8254 *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
8255 }
8256
8257 void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
8258 Temp *out_ma, Temp *out_sc, Temp *out_tc)
8259 {
8260 Builder bld(ctx->program, ctx->block);
8261
8262 Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
8263 Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
8264 Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
8265
8266 Operand neg_one(0xbf800000u);
8267 Operand one(0x3f800000u);
8268 Operand two(0x40000000u);
8269 Operand four(0x40800000u);
8270
8271 Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma);
8272 Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
8273 Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
8274
8275 Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
8276 Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
8277 is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
8278 Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y);
8279
8280 // select sc
8281 Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
8282 Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1),
8283 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z),
8284 one, is_ma_y);
8285 *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
8286
8287 // select tc
8288 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
8289 sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
8290 *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
8291
8292 // select ma
8293 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
8294 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
8295 deriv_z, is_ma_z);
8296 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp);
8297 *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
8298 }
8299
8300 void prepare_cube_coords(isel_context *ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
8301 {
8302 Builder bld(ctx->program, ctx->block);
8303 Temp ma, tc, sc, id;
8304
8305 if (is_array) {
8306 coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);
8307
8308 // see comment in ac_prepare_cube_coords()
8309 if (ctx->options->chip_class <= GFX8)
8310 coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coords[3]);
8311 }
8312
8313 ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8314
8315 aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
8316 vop3a->operands[0] = Operand(ma);
8317 vop3a->abs[0] = true;
8318 Temp invma = bld.tmp(v1);
8319 vop3a->definitions[0] = Definition(invma);
8320 ctx->block->instructions.emplace_back(std::move(vop3a));
8321
8322 sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8323 if (!is_deriv)
8324 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
8325
8326 tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8327 if (!is_deriv)
8328 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
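   /* In the non-derivative path both face coordinates end up as
    * coord * (1/|ma|) + 1.5 (v_madak folds the +1.5 in); the derivative path
    * below multiplies by invma first and adds the 1.5 after the derivatives
    * have been built, mirroring ac_prepare_cube_coords(). */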
8329
8330 id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8331
8332 if (is_deriv) {
8333 sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
8334 tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
8335
8336 for (unsigned i = 0; i < 2; i++) {
8337 // see comment in ac_prepare_cube_coords()
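         /* This is the quotient rule:
          *   d(sc/|ma|) = d(sc)/|ma| - (d|ma|/|ma|) * (sc/|ma|)
          * sc/tc were already multiplied by invma above, and deriv_ma is
          * multiplied by invma below before the subtraction. */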
8338 Temp deriv_ma;
8339 Temp deriv_sc, deriv_tc;
8340 build_cube_select(ctx, ma, id, i ? *ddy : *ddx,
8341 &deriv_ma, &deriv_sc, &deriv_tc);
8342
8343 deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
8344
8345 Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
8346 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
8347 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
8348 Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
8349 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
8350 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
8351 *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
8352 }
8353
8354 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc);
8355 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc);
8356 }
8357
8358 if (is_array)
8359 id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/));
8360 coords.resize(3);
8361 coords[0] = sc;
8362 coords[1] = tc;
8363 coords[2] = id;
8364 }
8365
8366 void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
8367 {
8368 if (vec->parent_instr->type != nir_instr_type_alu)
8369 return;
8370 nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr);
8371 if (vec_instr->op != nir_op_vec(vec->num_components))
8372 return;
8373
8374 for (unsigned i = 0; i < vec->num_components; i++) {
8375 cv[i] = vec_instr->src[i].swizzle[0] == 0 ?
8376 nir_src_as_const_value(vec_instr->src[i].src) : NULL;
8377 }
8378 }
8379
8380 void visit_tex(isel_context *ctx, nir_tex_instr *instr)
8381 {
8382 Builder bld(ctx->program, ctx->block);
8383 bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
8384 has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false,
8385 has_clamped_lod = false;
8386 Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(),
8387 lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(),
8388 clamped_lod = Temp();
8389 std::vector<Temp> coords;
8390 std::vector<Temp> derivs;
8391 nir_const_value *sample_index_cv = NULL;
8392 nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
8393 enum glsl_base_type stype;
8394 tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
8395
8396 bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
8397 (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
8398 bool tg4_integer_cube_workaround = tg4_integer_workarounds &&
8399 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
8400
8401 for (unsigned i = 0; i < instr->num_srcs; i++) {
8402 switch (instr->src[i].src_type) {
8403 case nir_tex_src_coord: {
8404 Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
8405 for (unsigned i = 0; i < coord.size(); i++)
8406 coords.emplace_back(emit_extract_vector(ctx, coord, i, v1));
8407 break;
8408 }
8409 case nir_tex_src_bias:
8410 bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
8411 has_bias = true;
8412 break;
8413 case nir_tex_src_lod: {
8414 nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
8415
8416 if (val && val->f32 <= 0.0) {
8417 level_zero = true;
8418 } else {
8419 lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
8420 has_lod = true;
8421 }
8422 break;
8423 }
8424 case nir_tex_src_min_lod:
8425 clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
8426 has_clamped_lod = true;
8427 break;
8428 case nir_tex_src_comparator:
8429 if (instr->is_shadow) {
8430 compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
8431 has_compare = true;
8432 }
8433 break;
8434 case nir_tex_src_offset:
8435 offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
8436 get_const_vec(instr->src[i].src.ssa, const_offset);
8437 has_offset = true;
8438 break;
8439 case nir_tex_src_ddx:
8440 ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
8441 has_ddx = true;
8442 break;
8443 case nir_tex_src_ddy:
8444 ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
8445 has_ddy = true;
8446 break;
8447 case nir_tex_src_ms_index:
8448 sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
8449 sample_index_cv = nir_src_as_const_value(instr->src[i].src);
8450 has_sample_index = true;
8451 break;
8452 case nir_tex_src_texture_offset:
8453 case nir_tex_src_sampler_offset:
8454 default:
8455 break;
8456 }
8457 }
8458
8459 if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
8460 return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
8461
8462 if (instr->op == nir_texop_texture_samples) {
8463 Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
8464
8465 Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
8466 Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
8467 Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
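      /* The s_bfe operands encode offset | (width << 16): dword3 bits [16..19]
       * hold log2(samples) and bits [28..31] the resource type; types >= 14 are
       * presumably the 2D MSAA and 2D MSAA-array resource types. */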
8468
8469 Operand default_sample = Operand(1u);
8470 if (ctx->options->robust_buffer_access) {
8471 /* Extract the second dword of the descriptor, if it's
8472 * all zero, then it's a null descriptor.
8473 */
8474 Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
8475 Temp is_non_null_descriptor = bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u));
8476 default_sample = Operand(is_non_null_descriptor);
8477 }
8478
8479 Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
8480 bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8481 samples, default_sample, bld.scc(is_msaa));
8482 return;
8483 }
8484
8485 if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
8486 aco_ptr<Instruction> tmp_instr;
8487 Temp acc, pack = Temp();
8488
8489 uint32_t pack_const = 0;
8490 for (unsigned i = 0; i < offset.size(); i++) {
8491 if (!const_offset[i])
8492 continue;
8493 pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
8494 }
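      /* The packed MIMG offset presumably holds one 6-bit signed component per
       * byte (x in bits [0..5], y in [8..13], z in [16..21]), hence the 0x3F
       * masks and the 8 * i shifts used here and below. */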
8495
8496 if (offset.type() == RegType::sgpr) {
8497 for (unsigned i = 0; i < offset.size(); i++) {
8498 if (const_offset[i])
8499 continue;
8500
8501 acc = emit_extract_vector(ctx, offset, i, s1);
8502 acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
8503
8504 if (i) {
8505 acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
8506 }
8507
8508 if (pack == Temp()) {
8509 pack = acc;
8510 } else {
8511 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
8512 }
8513 }
8514
8515 if (pack_const && pack != Temp())
8516 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
8517 } else {
8518 for (unsigned i = 0; i < offset.size(); i++) {
8519 if (const_offset[i])
8520 continue;
8521
8522 acc = emit_extract_vector(ctx, offset, i, v1);
8523 acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
8524
8525 if (i) {
8526 acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
8527 }
8528
8529 if (pack == Temp()) {
8530 pack = acc;
8531 } else {
8532 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
8533 }
8534 }
8535
8536 if (pack_const && pack != Temp())
8537 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
8538 }
8539 if (pack_const && pack == Temp())
8540 offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
8541 else if (pack == Temp())
8542 has_offset = false;
8543 else
8544 offset = pack;
8545 }
8546
8547 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
8548 prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
8549
8550 /* pack derivatives */
8551 if (has_ddx || has_ddy) {
8552 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
8553 assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1);
8554 Temp zero = bld.copy(bld.def(v1), Operand(0u));
8555 derivs = {ddx, zero, ddy, zero};
8556 } else {
8557 for (unsigned i = 0; has_ddx && i < ddx.size(); i++)
8558 derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1));
8559 for (unsigned i = 0; has_ddy && i < ddy.size(); i++)
8560 derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1));
8561 }
8562 has_derivs = true;
8563 }
8564
8565 if (instr->coord_components > 1 &&
8566 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
8567 instr->is_array &&
8568 instr->op != nir_texop_txf)
8569 coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]);
8570
8571 if (instr->coord_components > 2 &&
8572 (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
8573 instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
8574 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
8575 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
8576 instr->is_array &&
8577 instr->op != nir_texop_txf &&
8578 instr->op != nir_texop_txf_ms &&
8579 instr->op != nir_texop_fragment_fetch &&
8580 instr->op != nir_texop_fragment_mask_fetch)
8581 coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]);
8582
8583 if (ctx->options->chip_class == GFX9 &&
8584 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
8585 instr->op != nir_texop_lod && instr->coord_components) {
8586 assert(coords.size() > 0 && coords.size() < 3);
8587
8588 coords.insert(std::next(coords.begin()), bld.copy(bld.def(v1), instr->op == nir_texop_txf ?
8589 Operand((uint32_t) 0) :
8590 Operand((uint32_t) 0x3f000000)));
8591 }
8592
8593 bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
8594
8595 if (instr->op == nir_texop_samples_identical)
8596 resource = fmask_ptr;
8597
8598 else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
8599 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
8600 instr->op != nir_texop_txs &&
8601 instr->op != nir_texop_fragment_fetch &&
8602 instr->op != nir_texop_fragment_mask_fetch) {
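      /* (The FMASK texel stores, for each sample, which color fragment slot
       * holds that sample's data, so the logical sample index is remapped
       * through it before the fetch; see adjust_sample_index_using_fmask().) */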
8603 assert(has_sample_index);
8604 Operand op(sample_index);
8605 if (sample_index_cv)
8606 op = Operand(sample_index_cv->u32);
8607 sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
8608 }
8609
8610 if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
8611 for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) {
8612 Temp off = emit_extract_vector(ctx, offset, i, v1);
8613 coords[i] = bld.vadd32(bld.def(v1), coords[i], off);
8614 }
8615 has_offset = false;
8616 }
8617
8618 /* Build tex instruction */
8619 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
8620 unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
8621 ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
8622 : 0;
8623 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8624 Temp tmp_dst = dst;
8625
8626 /* gather4 selects the component by dmask and always returns vec4 */
8627 if (instr->op == nir_texop_tg4) {
8628 assert(instr->dest.ssa.num_components == 4);
8629 if (instr->is_shadow)
8630 dmask = 1;
8631 else
8632 dmask = 1 << instr->component;
8633 if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
8634 tmp_dst = bld.tmp(v4);
8635 } else if (instr->op == nir_texop_samples_identical) {
8636 tmp_dst = bld.tmp(v1);
8637 } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) {
8638 tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
8639 }
8640
8641 aco_ptr<MIMG_instruction> tex;
8642 if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
8643 if (!has_lod)
8644 lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
8645
8646 bool div_by_6 = instr->op == nir_texop_txs &&
8647 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
8648 instr->is_array &&
8649 (dmask & (1 << 2));
8650 if (tmp_dst.id() == dst.id() && div_by_6)
8651 tmp_dst = bld.tmp(tmp_dst.regClass());
8652
8653 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
8654 tex->operands[0] = Operand(resource);
8655 tex->operands[1] = Operand(s4); /* no sampler */
8656 tex->operands[2] = Operand(as_vgpr(ctx,lod));
8657 if (ctx->options->chip_class == GFX9 &&
8658 instr->op == nir_texop_txs &&
8659 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
8660 instr->is_array) {
8661 tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
8662 } else if (instr->op == nir_texop_query_levels) {
8663 tex->dmask = 1 << 3;
8664 } else {
8665 tex->dmask = dmask;
8666 }
8667 tex->da = da;
8668 tex->definitions[0] = Definition(tmp_dst);
8669 tex->dim = dim;
8670 tex->can_reorder = true;
8671 ctx->block->instructions.emplace_back(std::move(tex));
8672
8673 if (div_by_6) {
8674 /* divide 3rd value by 6 by multiplying with magic number */
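            /* 0x2AAAAAAB == ceil(2^32 / 6), so for 0 <= x < 2^31 we have
             * mul_hi(x, 0x2AAAAAAB) == floor(x * 0x2AAAAAAB / 2^32) == x / 6,
             * e.g. 12 * 0x2AAAAAAB = 0x200000004, whose high dword is 2. */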
8675 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
8676 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
8677 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c);
8678 assert(instr->dest.ssa.num_components == 3);
8679 Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
8680 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
8681 emit_extract_vector(ctx, tmp_dst, 0, v1),
8682 emit_extract_vector(ctx, tmp_dst, 1, v1),
8683 by_6);
8684
8685 }
8686
8687 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
8688 return;
8689 }
8690
8691 Temp tg4_compare_cube_wa64 = Temp();
8692
8693 if (tg4_integer_workarounds) {
8694 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
8695 tex->operands[0] = Operand(resource);
8696 tex->operands[1] = Operand(s4); /* no sampler */
8697 tex->operands[2] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
8698 tex->dim = dim;
8699 tex->dmask = 0x3;
8700 tex->da = da;
8701 Temp size = bld.tmp(v2);
8702 tex->definitions[0] = Definition(size);
8703 tex->can_reorder = true;
8704 ctx->block->instructions.emplace_back(std::move(tex));
8705 emit_split_vector(ctx, size, size.size());
8706
8707 Temp half_texel[2];
8708 for (unsigned i = 0; i < 2; i++) {
8709 half_texel[i] = emit_extract_vector(ctx, size, i, v1);
8710 half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
8711 half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
8712 half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
8713 }
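      /* half_texel[i] is now -0.5 / size[i], i.e. half a texel in normalized
       * coordinates; adding it below shifts the gather coordinates down by half
       * a texel to get the round-down behaviour the integer formats need on
       * GFX8 and older (see lower_gather4_integer() in ac_nir_to_llvm.c). */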
8714
8715 Temp new_coords[2] = {
8716 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
8717 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])
8718 };
8719
8720 if (tg4_integer_cube_workaround) {
8721 // see comment in ac_nir_to_llvm.c's lower_gather4_integer()
8722 Temp desc[resource.size()];
8723 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector,
8724 Format::PSEUDO, 1, resource.size())};
8725 split->operands[0] = Operand(resource);
8726 for (unsigned i = 0; i < resource.size(); i++) {
8727 desc[i] = bld.tmp(s1);
8728 split->definitions[i] = Definition(desc[i]);
8729 }
8730 ctx->block->instructions.emplace_back(std::move(split));
8731
8732 Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16)));
8733 Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
8734 Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8));
8735
8736 Temp nfmt;
8737 if (stype == GLSL_TYPE_UINT) {
8738 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
8739 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED),
8740 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT),
8741 bld.scc(compare_cube_wa));
8742 } else {
8743 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
8744 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED),
8745 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
8746 bld.scc(compare_cube_wa));
8747 }
8748 tg4_compare_cube_wa64 = bld.tmp(bld.lm);
8749 bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
8750
8751 nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
8752
8753 desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
8754 Operand((uint32_t)C_008F14_NUM_FORMAT));
8755 desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
8756
8757 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
8758 Format::PSEUDO, resource.size(), 1)};
8759 for (unsigned i = 0; i < resource.size(); i++)
8760 vec->operands[i] = Operand(desc[i]);
8761 resource = bld.tmp(resource.regClass());
8762 vec->definitions[0] = Definition(resource);
8763 ctx->block->instructions.emplace_back(std::move(vec));
8764
8765 new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
8766 new_coords[0], coords[0], tg4_compare_cube_wa64);
8767 new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
8768 new_coords[1], coords[1], tg4_compare_cube_wa64);
8769 }
8770 coords[0] = new_coords[0];
8771 coords[1] = new_coords[1];
8772 }
8773
8774 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
8775 //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
8776
8777 assert(coords.size() == 1);
8778 unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
8779 aco_opcode op;
8780 switch (last_bit) {
8781 case 1:
8782 op = aco_opcode::buffer_load_format_x; break;
8783 case 2:
8784 op = aco_opcode::buffer_load_format_xy; break;
8785 case 3:
8786 op = aco_opcode::buffer_load_format_xyz; break;
8787 case 4:
8788 op = aco_opcode::buffer_load_format_xyzw; break;
8789 default:
8790 unreachable("Tex instruction loads more than 4 components.");
8791 }
8792
8793 /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */
8794 if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
8795 tmp_dst = dst;
8796 else
8797 tmp_dst = bld.tmp(RegType::vgpr, last_bit);
8798
8799 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
8800 mubuf->operands[0] = Operand(resource);
8801 mubuf->operands[1] = Operand(coords[0]);
8802 mubuf->operands[2] = Operand((uint32_t) 0);
8803 mubuf->definitions[0] = Definition(tmp_dst);
8804 mubuf->idxen = true;
8805 mubuf->can_reorder = true;
8806 ctx->block->instructions.emplace_back(std::move(mubuf));
8807
8808 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
8809 return;
8810 }
8811
8812 /* gather MIMG address components */
8813 std::vector<Temp> args;
8814 if (has_offset)
8815 args.emplace_back(offset);
8816 if (has_bias)
8817 args.emplace_back(bias);
8818 if (has_compare)
8819 args.emplace_back(compare);
8820 if (has_derivs)
8821 args.insert(args.end(), derivs.begin(), derivs.end());
8822
8823 args.insert(args.end(), coords.begin(), coords.end());
8824 if (has_sample_index)
8825 args.emplace_back(sample_index);
8826 if (has_lod)
8827 args.emplace_back(lod);
8828 if (has_clamped_lod)
8829 args.emplace_back(clamped_lod);
8830
8831 Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size()));
8832 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
8833 vec->definitions[0] = Definition(arg);
8834 for (unsigned i = 0; i < args.size(); i++)
8835 vec->operands[i] = Operand(args[i]);
8836 ctx->block->instructions.emplace_back(std::move(vec));
8837
8838
8839 if (instr->op == nir_texop_txf ||
8840 instr->op == nir_texop_txf_ms ||
8841 instr->op == nir_texop_samples_identical ||
8842 instr->op == nir_texop_fragment_fetch ||
8843 instr->op == nir_texop_fragment_mask_fetch) {
8844 aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
8845 tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 3, 1));
8846 tex->operands[0] = Operand(resource);
8847 tex->operands[1] = Operand(s4); /* no sampler */
8848 tex->operands[2] = Operand(arg);
8849 tex->dim = dim;
8850 tex->dmask = dmask;
8851 tex->unrm = true;
8852 tex->da = da;
8853 tex->definitions[0] = Definition(tmp_dst);
8854 tex->can_reorder = true;
8855 ctx->block->instructions.emplace_back(std::move(tex));
8856
8857 if (instr->op == nir_texop_samples_identical) {
8858 assert(dmask == 1 && dst.regClass() == v1);
8859 assert(dst.id() != tmp_dst.id());
8860
8861 Temp tmp = bld.tmp(bld.lm);
8862 bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
8863 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
8864
8865 } else {
8866 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
8867 }
8868 return;
8869 }
8870
8871 // TODO: it would be better to do this by adding offsets, but that needs the opcodes to be ordered.
8872 aco_opcode opcode = aco_opcode::image_sample;
8873 if (has_offset) { /* image_sample_*_o */
8874 if (has_clamped_lod) {
8875 if (has_compare) {
8876 opcode = aco_opcode::image_sample_c_cl_o;
8877 if (has_derivs)
8878 opcode = aco_opcode::image_sample_c_d_cl_o;
8879 if (has_bias)
8880 opcode = aco_opcode::image_sample_c_b_cl_o;
8881 } else {
8882 opcode = aco_opcode::image_sample_cl_o;
8883 if (has_derivs)
8884 opcode = aco_opcode::image_sample_d_cl_o;
8885 if (has_bias)
8886 opcode = aco_opcode::image_sample_b_cl_o;
8887 }
8888 } else if (has_compare) {
8889 opcode = aco_opcode::image_sample_c_o;
8890 if (has_derivs)
8891 opcode = aco_opcode::image_sample_c_d_o;
8892 if (has_bias)
8893 opcode = aco_opcode::image_sample_c_b_o;
8894 if (level_zero)
8895 opcode = aco_opcode::image_sample_c_lz_o;
8896 if (has_lod)
8897 opcode = aco_opcode::image_sample_c_l_o;
8898 } else {
8899 opcode = aco_opcode::image_sample_o;
8900 if (has_derivs)
8901 opcode = aco_opcode::image_sample_d_o;
8902 if (has_bias)
8903 opcode = aco_opcode::image_sample_b_o;
8904 if (level_zero)
8905 opcode = aco_opcode::image_sample_lz_o;
8906 if (has_lod)
8907 opcode = aco_opcode::image_sample_l_o;
8908 }
8909 } else if (has_clamped_lod) { /* image_sample_*_cl */
8910 if (has_compare) {
8911 opcode = aco_opcode::image_sample_c_cl;
8912 if (has_derivs)
8913 opcode = aco_opcode::image_sample_c_d_cl;
8914 if (has_bias)
8915 opcode = aco_opcode::image_sample_c_b_cl;
8916 } else {
8917 opcode = aco_opcode::image_sample_cl;
8918 if (has_derivs)
8919 opcode = aco_opcode::image_sample_d_cl;
8920 if (has_bias)
8921 opcode = aco_opcode::image_sample_b_cl;
8922 }
8923 } else { /* no offset */
8924 if (has_compare) {
8925 opcode = aco_opcode::image_sample_c;
8926 if (has_derivs)
8927 opcode = aco_opcode::image_sample_c_d;
8928 if (has_bias)
8929 opcode = aco_opcode::image_sample_c_b;
8930 if (level_zero)
8931 opcode = aco_opcode::image_sample_c_lz;
8932 if (has_lod)
8933 opcode = aco_opcode::image_sample_c_l;
8934 } else {
8935 opcode = aco_opcode::image_sample;
8936 if (has_derivs)
8937 opcode = aco_opcode::image_sample_d;
8938 if (has_bias)
8939 opcode = aco_opcode::image_sample_b;
8940 if (level_zero)
8941 opcode = aco_opcode::image_sample_lz;
8942 if (has_lod)
8943 opcode = aco_opcode::image_sample_l;
8944 }
8945 }
8946
8947 if (instr->op == nir_texop_tg4) {
8948 if (has_offset) { /* image_gather4_*_o */
8949 if (has_compare) {
8950 opcode = aco_opcode::image_gather4_c_lz_o;
8951 if (has_lod)
8952 opcode = aco_opcode::image_gather4_c_l_o;
8953 if (has_bias)
8954 opcode = aco_opcode::image_gather4_c_b_o;
8955 } else {
8956 opcode = aco_opcode::image_gather4_lz_o;
8957 if (has_lod)
8958 opcode = aco_opcode::image_gather4_l_o;
8959 if (has_bias)
8960 opcode = aco_opcode::image_gather4_b_o;
8961 }
8962 } else {
8963 if (has_compare) {
8964 opcode = aco_opcode::image_gather4_c_lz;
8965 if (has_lod)
8966 opcode = aco_opcode::image_gather4_c_l;
8967 if (has_bias)
8968 opcode = aco_opcode::image_gather4_c_b;
8969 } else {
8970 opcode = aco_opcode::image_gather4_lz;
8971 if (has_lod)
8972 opcode = aco_opcode::image_gather4_l;
8973 if (has_bias)
8974 opcode = aco_opcode::image_gather4_b;
8975 }
8976 }
8977 } else if (instr->op == nir_texop_lod) {
8978 opcode = aco_opcode::image_get_lod;
8979 }
8980
8981 /* we don't need the bias, sample index, compare value or offset to be
8982 * computed in WQM but if the p_create_vector copies the coordinates, then it
8983 * needs to be in WQM */
8984 if (ctx->stage == fragment_fs &&
8985 !has_derivs && !has_lod && !level_zero &&
8986 instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
8987 instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
8988 arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true);
8989
8990 tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
8991 tex->operands[0] = Operand(resource);
8992 tex->operands[1] = Operand(sampler);
8993 tex->operands[2] = Operand(arg);
8994 tex->dim = dim;
8995 tex->dmask = dmask;
8996 tex->da = da;
8997 tex->definitions[0] = Definition(tmp_dst);
8998 tex->can_reorder = true;
8999 ctx->block->instructions.emplace_back(std::move(tex));
9000
9001 if (tg4_integer_cube_workaround) {
9002 assert(tmp_dst.id() != dst.id());
9003 assert(tmp_dst.size() == dst.size() && dst.size() == 4);
9004
9005 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
9006 Temp val[4];
9007 for (unsigned i = 0; i < dst.size(); i++) {
9008 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
9009 Temp cvt_val;
9010 if (stype == GLSL_TYPE_UINT)
9011 cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
9012 else
9013 cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
9014 val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64);
9015 }
9016 Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4);
9017 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
9018 val[0], val[1], val[2], val[3]);
9019 }
9020 unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask;
9021 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
9022
9023 }
9024
9025
9026 Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc, bool logical)
9027 {
9028 Temp tmp = get_ssa_temp(ctx, ssa);
9029 if (ssa->parent_instr->type == nir_instr_type_ssa_undef) {
9030 return Operand(rc);
9031 } else if (logical && ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) {
9032 if (ctx->program->wave_size == 64)
9033 return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX : 0u);
9034 else
9035 return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX : 0u);
9036 } else {
9037 return Operand(tmp);
9038 }
9039 }
9040
9041 void visit_phi(isel_context *ctx, nir_phi_instr *instr)
9042 {
9043 aco_ptr<Pseudo_instruction> phi;
9044 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9045 assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
9046
9047 bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest);
9048 logical |= ctx->block->kind & block_kind_merge;
9049 aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
9050
9051 /* we want a sorted list of sources, since the predecessor list is also sorted */
9052 std::map<unsigned, nir_ssa_def*> phi_src;
9053 nir_foreach_phi_src(src, instr)
9054 phi_src[src->pred->index] = src->src.ssa;
9055
9056 std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
9057 unsigned num_operands = 0;
9058 Operand operands[std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1];
9059 unsigned num_defined = 0;
9060 unsigned cur_pred_idx = 0;
9061 for (std::pair<unsigned, nir_ssa_def *> src : phi_src) {
9062 if (cur_pred_idx < preds.size()) {
9063 /* handle missing preds (IF merges with discard/break) and extra preds (loop exit with discard) */
9064 unsigned block = ctx->cf_info.nir_to_aco[src.first];
9065 unsigned skipped = 0;
9066 while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
9067 skipped++;
9068 if (cur_pred_idx + skipped < preds.size()) {
9069 for (unsigned i = 0; i < skipped; i++)
9070 operands[num_operands++] = Operand(dst.regClass());
9071 cur_pred_idx += skipped;
9072 } else {
9073 continue;
9074 }
9075 }
9076 /* Handle missing predecessors at the end. This shouldn't happen with loop
9077 * headers and we can't ignore these sources for loop header phis. */
9078 if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
9079 continue;
9080 cur_pred_idx++;
9081 Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
9082 operands[num_operands++] = op;
9083 num_defined += !op.isUndefined();
9084 }
9085 /* handle block_kind_continue_or_break at loop exit blocks */
9086 while (cur_pred_idx++ < preds.size())
9087 operands[num_operands++] = Operand(dst.regClass());
9088
9089 /* If the loop ends with a break, still add a linear continue edge in case
9090 * that break is divergent or continue_or_break is used. We'll either remove
9091 * this operand later in visit_loop() if it's not necessary or replace the
9092 * undef with something correct. */
9093 if (!logical && ctx->block->kind & block_kind_loop_header) {
9094 nir_loop *loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
9095 nir_block *last = nir_loop_last_block(loop);
9096 if (last->successors[0] != instr->instr.block)
9097 operands[num_operands++] = Operand(RegClass());
9098 }
9099
9100 if (num_defined == 0) {
9101 Builder bld(ctx->program, ctx->block);
9102 if (dst.regClass() == s1) {
9103 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
9104 } else if (dst.regClass() == v1) {
9105 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
9106 } else {
9107 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9108 for (unsigned i = 0; i < dst.size(); i++)
9109 vec->operands[i] = Operand(0u);
9110 vec->definitions[0] = Definition(dst);
9111 ctx->block->instructions.emplace_back(std::move(vec));
9112 }
9113 return;
9114 }
9115
9116 /* we can use a linear phi in some cases if one src is undef */
9117 if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
9118 phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, num_operands, 1));
9119
9120 Block *linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
9121 Block *invert = &ctx->program->blocks[linear_else->linear_preds[0]];
9122 assert(invert->kind & block_kind_invert);
9123
9124 unsigned then_block = invert->linear_preds[0];
9125
9126 Block* insert_block = NULL;
9127 for (unsigned i = 0; i < num_operands; i++) {
9128 Operand op = operands[i];
9129 if (op.isUndefined())
9130 continue;
9131 insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
9132 phi->operands[0] = op;
9133 break;
9134 }
9135 assert(insert_block); /* should be handled by the "num_defined == 0" case above */
9136 phi->operands[1] = Operand(dst.regClass());
9137 phi->definitions[0] = Definition(dst);
9138 insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
9139 return;
9140 }
9141
9142 /* try to scalarize vector phis */
9143 if (instr->dest.ssa.bit_size != 1 && dst.size() > 1) {
9144 // TODO: scalarize linear phis on divergent ifs
9145 bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
9146 std::array<Temp, NIR_MAX_VEC_COMPONENTS> new_vec;
9147 for (unsigned i = 0; can_scalarize && (i < num_operands); i++) {
9148 Operand src = operands[i];
9149 if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end())
9150 can_scalarize = false;
9151 }
9152 if (can_scalarize) {
9153 unsigned num_components = instr->dest.ssa.num_components;
9154 assert(dst.size() % num_components == 0);
9155 RegClass rc = RegClass(dst.type(), dst.size() / num_components);
9156
9157 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
9158 for (unsigned k = 0; k < num_components; k++) {
9159 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
9160 for (unsigned i = 0; i < num_operands; i++) {
9161 Operand src = operands[i];
9162 phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc);
9163 }
9164 Temp phi_dst = {ctx->program->allocateId(), rc};
9165 phi->definitions[0] = Definition(phi_dst);
9166 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9167 new_vec[k] = phi_dst;
9168 vec->operands[k] = Operand(phi_dst);
9169 }
9170 vec->definitions[0] = Definition(dst);
9171 ctx->block->instructions.emplace_back(std::move(vec));
9172 ctx->allocated_vec.emplace(dst.id(), new_vec);
9173 return;
9174 }
9175 }
9176
9177 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
9178 for (unsigned i = 0; i < num_operands; i++)
9179 phi->operands[i] = operands[i];
9180 phi->definitions[0] = Definition(dst);
9181 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9182 }
9183
9184
9185 void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr)
9186 {
9187 Temp dst = get_ssa_temp(ctx, &instr->def);
9188
9189 assert(dst.type() == RegType::sgpr);
9190
9191 if (dst.size() == 1) {
9192 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u));
9193 } else {
9194 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9195 for (unsigned i = 0; i < dst.size(); i++)
9196 vec->operands[i] = Operand(0u);
9197 vec->definitions[0] = Definition(dst);
9198 ctx->block->instructions.emplace_back(std::move(vec));
9199 }
9200 }
9201
9202 void visit_jump(isel_context *ctx, nir_jump_instr *instr)
9203 {
9204 Builder bld(ctx->program, ctx->block);
9205 Block *logical_target;
9206 append_logical_end(ctx->block);
9207 unsigned idx = ctx->block->index;
9208
9209 switch (instr->type) {
9210 case nir_jump_break:
9211 logical_target = ctx->cf_info.parent_loop.exit;
9212 add_logical_edge(idx, logical_target);
9213 ctx->block->kind |= block_kind_break;
9214
9215 if (!ctx->cf_info.parent_if.is_divergent &&
9216 !ctx->cf_info.parent_loop.has_divergent_continue) {
9217 /* uniform break - directly jump out of the loop */
9218 ctx->block->kind |= block_kind_uniform;
9219 ctx->cf_info.has_branch = true;
9220 bld.branch(aco_opcode::p_branch);
9221 add_linear_edge(idx, logical_target);
9222 return;
9223 }
9224 ctx->cf_info.parent_loop.has_divergent_branch = true;
9225 ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
9226 break;
9227 case nir_jump_continue:
9228 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
9229 add_logical_edge(idx, logical_target);
9230 ctx->block->kind |= block_kind_continue;
9231
9232 if (ctx->cf_info.parent_if.is_divergent) {
9233 /* for potential uniform breaks after this continue,
9234 we must ensure that they are handled correctly */
9235 ctx->cf_info.parent_loop.has_divergent_continue = true;
9236 ctx->cf_info.parent_loop.has_divergent_branch = true;
9237 ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
9238 } else {
9239 /* uniform continue - directly jump to the loop header */
9240 ctx->block->kind |= block_kind_uniform;
9241 ctx->cf_info.has_branch = true;
9242 bld.branch(aco_opcode::p_branch);
9243 add_linear_edge(idx, logical_target);
9244 return;
9245 }
9246 break;
9247 default:
9248 fprintf(stderr, "Unknown NIR jump instr: ");
9249 nir_print_instr(&instr->instr, stderr);
9250 fprintf(stderr, "\n");
9251 abort();
9252 }
9253
9254 if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
9255 ctx->cf_info.exec_potentially_empty_break = true;
9256 ctx->cf_info.exec_potentially_empty_break_depth = ctx->cf_info.loop_nest_depth;
9257 }
9258
9259 /* remove critical edges from linear CFG */
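   /* The jump block gets two linear successors (the break/continue target and
    * the fall-through block created below) and the target usually has several
    * predecessors, so an empty break_block is inserted to split that edge. */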
9260 bld.branch(aco_opcode::p_branch);
9261 Block* break_block = ctx->program->create_and_insert_block();
9262 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9263 break_block->kind |= block_kind_uniform;
9264 add_linear_edge(idx, break_block);
9265 /* the loop_header pointer might be invalidated by this point */
9266 if (instr->type == nir_jump_continue)
9267 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
9268 add_linear_edge(break_block->index, logical_target);
9269 bld.reset(break_block);
9270 bld.branch(aco_opcode::p_branch);
9271
9272 Block* continue_block = ctx->program->create_and_insert_block();
9273 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9274 add_linear_edge(idx, continue_block);
9275 append_logical_start(continue_block);
9276 ctx->block = continue_block;
9277 return;
9278 }
9279
9280 void visit_block(isel_context *ctx, nir_block *block)
9281 {
9282 nir_foreach_instr(instr, block) {
9283 switch (instr->type) {
9284 case nir_instr_type_alu:
9285 visit_alu_instr(ctx, nir_instr_as_alu(instr));
9286 break;
9287 case nir_instr_type_load_const:
9288 visit_load_const(ctx, nir_instr_as_load_const(instr));
9289 break;
9290 case nir_instr_type_intrinsic:
9291 visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
9292 break;
9293 case nir_instr_type_tex:
9294 visit_tex(ctx, nir_instr_as_tex(instr));
9295 break;
9296 case nir_instr_type_phi:
9297 visit_phi(ctx, nir_instr_as_phi(instr));
9298 break;
9299 case nir_instr_type_ssa_undef:
9300 visit_undef(ctx, nir_instr_as_ssa_undef(instr));
9301 break;
9302 case nir_instr_type_deref:
9303 break;
9304 case nir_instr_type_jump:
9305 visit_jump(ctx, nir_instr_as_jump(instr));
9306 break;
9307 default:
9308 fprintf(stderr, "Unknown NIR instr type: ");
9309 nir_print_instr(instr, stderr);
9310 fprintf(stderr, "\n");
9311 //abort();
9312 }
9313 }
9314
9315 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9316 ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
9317 }
9318
9319
9320
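/* For a loop header linear phi, compute the value its extra "continue" operand
 * should take when the back-edge comes from the continue_or_break block emitted
 * by visit_loop(): walk the blocks of the loop body, reuse the matching header
 * phi operand at continue blocks, and build linear phis wherever the linear
 * predecessors disagree, returning the value that reaches the last block. */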
9321 static Operand create_continue_phis(isel_context *ctx, unsigned first, unsigned last,
9322 aco_ptr<Instruction>& header_phi, Operand *vals)
9323 {
9324 vals[0] = Operand(header_phi->definitions[0].getTemp());
9325 RegClass rc = vals[0].regClass();
9326
9327 unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
9328
9329 unsigned next_pred = 1;
9330
9331 for (unsigned idx = first + 1; idx <= last; idx++) {
9332 Block& block = ctx->program->blocks[idx];
9333 if (block.loop_nest_depth != loop_nest_depth) {
9334 vals[idx - first] = vals[idx - 1 - first];
9335 continue;
9336 }
9337
9338 if (block.kind & block_kind_continue) {
9339 vals[idx - first] = header_phi->operands[next_pred];
9340 next_pred++;
9341 continue;
9342 }
9343
9344 bool all_same = true;
9345 for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
9346 all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
9347
9348 Operand val;
9349 if (all_same) {
9350 val = vals[block.linear_preds[0] - first];
9351 } else {
9352 aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
9353 aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
9354 for (unsigned i = 0; i < block.linear_preds.size(); i++)
9355 phi->operands[i] = vals[block.linear_preds[i] - first];
9356 val = Operand(Temp(ctx->program->allocateId(), rc));
9357 phi->definitions[0] = Definition(val.getTemp());
9358 block.instructions.emplace(block.instructions.begin(), std::move(phi));
9359 }
9360 vals[idx - first] = val;
9361 }
9362
9363 return vals[last - first];
9364 }
9365
9366 static void visit_loop(isel_context *ctx, nir_loop *loop)
9367 {
9368 //TODO: we might want to wrap the loop in a branch if exec_potentially_empty=true
9369 append_logical_end(ctx->block);
9370 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
9371 Builder bld(ctx->program, ctx->block);
9372 bld.branch(aco_opcode::p_branch);
9373 unsigned loop_preheader_idx = ctx->block->index;
9374
9375 Block loop_exit = Block();
9376 loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9377 loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
9378
9379 Block* loop_header = ctx->program->create_and_insert_block();
9380 loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
9381 loop_header->kind |= block_kind_loop_header;
9382 add_edge(loop_preheader_idx, loop_header);
9383 ctx->block = loop_header;
9384
9385 /* emit loop body */
9386 unsigned loop_header_idx = loop_header->index;
9387 loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
9388 append_logical_start(ctx->block);
9389 bool unreachable = visit_cf_list(ctx, &loop->body);
9390
9391 //TODO: what if a loop ends with an unconditional or uniformly branched continue and this branch is never taken?
9392 if (!ctx->cf_info.has_branch) {
9393 append_logical_end(ctx->block);
9394 if (ctx->cf_info.exec_potentially_empty_discard || ctx->cf_info.exec_potentially_empty_break) {
9395 /* Discards can result in code running with an empty exec mask.
9396 * This would result in divergent breaks not ever being taken. As a
9397 * workaround, break the loop when the loop mask is empty instead of
9398 * always continuing. */
9399 ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
9400 unsigned block_idx = ctx->block->index;
9401
9402 /* create helper blocks to avoid critical edges */
9403 Block *break_block = ctx->program->create_and_insert_block();
9404 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9405 break_block->kind = block_kind_uniform;
9406 bld.reset(break_block);
9407 bld.branch(aco_opcode::p_branch);
9408 add_linear_edge(block_idx, break_block);
9409 add_linear_edge(break_block->index, &loop_exit);
9410
9411 Block *continue_block = ctx->program->create_and_insert_block();
9412 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9413 continue_block->kind = block_kind_uniform;
9414 bld.reset(continue_block);
9415 bld.branch(aco_opcode::p_branch);
9416 add_linear_edge(block_idx, continue_block);
9417 add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
9418
9419 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9420 add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
9421 ctx->block = &ctx->program->blocks[block_idx];
9422 } else {
9423 ctx->block->kind |= (block_kind_continue | block_kind_uniform);
9424 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9425 add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9426 else
9427 add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9428 }
9429
9430 bld.reset(ctx->block);
9431 bld.branch(aco_opcode::p_branch);
9432 }
9433
9434 /* Fixup phis in loop header from unreachable blocks.
9435 * has_branch/has_divergent_branch also indicates if the loop ends with a
9436 * break/continue instruction, but we don't emit those if unreachable=true */
9437 if (unreachable) {
9438 assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
9439 bool linear = ctx->cf_info.has_branch;
9440 bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
9441 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
9442 if ((logical && instr->opcode == aco_opcode::p_phi) ||
9443 (linear && instr->opcode == aco_opcode::p_linear_phi)) {
9444 /* the last operand should be the one that needs to be removed */
9445 instr->operands.pop_back();
9446 } else if (!is_phi(instr)) {
9447 break;
9448 }
9449 }
9450 }
9451
9452 /* Fixup linear phis in loop header from expecting a continue. This fixup
9453 * and the previous one shouldn't both happen at once, because a break in the
9454 * merge block would get CSE'd. */
9455 if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
9456 unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
9457 Operand vals[num_vals];
9458 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
9459 if (instr->opcode == aco_opcode::p_linear_phi) {
9460 if (ctx->cf_info.has_branch)
9461 instr->operands.pop_back();
9462 else
9463 instr->operands.back() = create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
9464 } else if (!is_phi(instr)) {
9465 break;
9466 }
9467 }
9468 }
9469
9470 ctx->cf_info.has_branch = false;
9471
9472 // TODO: if the loop does not have a single exit, we must add one
9473 /* emit loop successor block */
9474 ctx->block = ctx->program->insert_block(std::move(loop_exit));
9475 append_logical_start(ctx->block);
9476
9477 #if 0
9478 // TODO: check if it is beneficial to not branch on continues
9479 /* trim linear phis in loop header */
9480 for (auto&& instr : loop_entry->instructions) {
9481 if (instr->opcode == aco_opcode::p_linear_phi) {
9482 aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
9483 new_phi->definitions[0] = instr->definitions[0];
9484 for (unsigned i = 0; i < new_phi->operands.size(); i++)
9485 new_phi->operands[i] = instr->operands[i];
9486 /* check that the remaining operands are all the same */
9487 for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
9488 assert(instr->operands[i].tempId() == instr->operands.back().tempId());
9489 instr.swap(new_phi);
9490 } else if (instr->opcode == aco_opcode::p_phi) {
9491 continue;
9492 } else {
9493 break;
9494 }
9495 }
9496 #endif
9497 }
9498
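/* Helpers for emitting divergent (non-uniform) control flow. Callers in this
 * file (visit_if, write_tcs_tess_factors, emit_streamout, ...) roughly follow
 * this pattern:
 *
 *    if_context ic;
 *    begin_divergent_if_then(ctx, &ic, cond);   // cond must be a lane mask
 *    ... emit the then-side code ...
 *    begin_divergent_if_else(ctx, &ic);
 *    ... emit the (possibly empty) else-side code ...
 *    end_divergent_if(ctx, &ic);
 *
 * This builds the linear/logical CFG shape documented in visit_if() below.
 */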
9499 static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond)
9500 {
9501 ic->cond = cond;
9502
9503 append_logical_end(ctx->block);
9504 ctx->block->kind |= block_kind_branch;
9505
9506 /* branch to linear then block */
9507 assert(cond.regClass() == ctx->program->lane_mask);
9508 aco_ptr<Pseudo_branch_instruction> branch;
9509 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
9510 branch->operands[0] = Operand(cond);
9511 ctx->block->instructions.push_back(std::move(branch));
9512
9513 ic->BB_if_idx = ctx->block->index;
9514 ic->BB_invert = Block();
9515 ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9516 /* Invert blocks are intentionally not marked as top level because they
9517 * are not part of the logical cfg. */
9518 ic->BB_invert.kind |= block_kind_invert;
9519 ic->BB_endif = Block();
9520 ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9521 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
9522
9523 ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
9524 ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
9525 ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
9526 ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
9527 ctx->cf_info.parent_if.is_divergent = true;
9528
9529 /* divergent branches use cbranch_execz */
9530 ctx->cf_info.exec_potentially_empty_discard = false;
9531 ctx->cf_info.exec_potentially_empty_break = false;
9532 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9533
9534 /** emit logical then block */
9535 Block* BB_then_logical = ctx->program->create_and_insert_block();
9536 BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9537 add_edge(ic->BB_if_idx, BB_then_logical);
9538 ctx->block = BB_then_logical;
9539 append_logical_start(BB_then_logical);
9540 }
9541
9542 static void begin_divergent_if_else(isel_context *ctx, if_context *ic)
9543 {
9544 Block *BB_then_logical = ctx->block;
9545 append_logical_end(BB_then_logical);
9546 /* branch from logical then block to invert block */
9547 aco_ptr<Pseudo_branch_instruction> branch;
9548 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9549 BB_then_logical->instructions.emplace_back(std::move(branch));
9550 add_linear_edge(BB_then_logical->index, &ic->BB_invert);
9551 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9552 add_logical_edge(BB_then_logical->index, &ic->BB_endif);
9553 BB_then_logical->kind |= block_kind_uniform;
9554 assert(!ctx->cf_info.has_branch);
9555 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
9556 ctx->cf_info.parent_loop.has_divergent_branch = false;
9557
9558 /** emit linear then block */
9559 Block* BB_then_linear = ctx->program->create_and_insert_block();
9560 BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9561 BB_then_linear->kind |= block_kind_uniform;
9562 add_linear_edge(ic->BB_if_idx, BB_then_linear);
9563 /* branch from linear then block to invert block */
9564 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9565 BB_then_linear->instructions.emplace_back(std::move(branch));
9566 add_linear_edge(BB_then_linear->index, &ic->BB_invert);
9567
9568 /** emit invert merge block */
9569 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
9570 ic->invert_idx = ctx->block->index;
9571
9572 /* branch to linear else block (skip else) */
9573 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0));
9574 branch->operands[0] = Operand(ic->cond);
9575 ctx->block->instructions.push_back(std::move(branch));
9576
9577 ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
9578 ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
9579 ic->exec_potentially_empty_break_depth_old =
9580 std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
9581 /* divergent branches use cbranch_execz */
9582 ctx->cf_info.exec_potentially_empty_discard = false;
9583 ctx->cf_info.exec_potentially_empty_break = false;
9584 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9585
9586 /** emit logical else block */
9587 Block* BB_else_logical = ctx->program->create_and_insert_block();
9588 BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9589 add_logical_edge(ic->BB_if_idx, BB_else_logical);
9590 add_linear_edge(ic->invert_idx, BB_else_logical);
9591 ctx->block = BB_else_logical;
9592 append_logical_start(BB_else_logical);
9593 }
9594
9595 static void end_divergent_if(isel_context *ctx, if_context *ic)
9596 {
9597 Block *BB_else_logical = ctx->block;
9598 append_logical_end(BB_else_logical);
9599
9600 /* branch from logical else block to endif block */
9601 aco_ptr<Pseudo_branch_instruction> branch;
9602 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9603 BB_else_logical->instructions.emplace_back(std::move(branch));
9604 add_linear_edge(BB_else_logical->index, &ic->BB_endif);
9605 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9606 add_logical_edge(BB_else_logical->index, &ic->BB_endif);
9607 BB_else_logical->kind |= block_kind_uniform;
9608
9609 assert(!ctx->cf_info.has_branch);
9610 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
9611
9612
9613 /** emit linear else block */
9614 Block* BB_else_linear = ctx->program->create_and_insert_block();
9615 BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9616 BB_else_linear->kind |= block_kind_uniform;
9617 add_linear_edge(ic->invert_idx, BB_else_linear);
9618
9619 /* branch from linear else block to endif block */
9620 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9621 BB_else_linear->instructions.emplace_back(std::move(branch));
9622 add_linear_edge(BB_else_linear->index, &ic->BB_endif);
9623
9624
9625 /** emit endif merge block */
9626 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
9627 append_logical_start(ctx->block);
9628
9629
9630 ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
9631 ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
9632 ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
9633 ctx->cf_info.exec_potentially_empty_break_depth =
9634 std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
9635 if (ctx->cf_info.loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
9636 !ctx->cf_info.parent_if.is_divergent) {
9637 ctx->cf_info.exec_potentially_empty_break = false;
9638 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9639 }
9640 /* uniform control flow never has an empty exec-mask */
9641 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
9642 ctx->cf_info.exec_potentially_empty_discard = false;
9643 ctx->cf_info.exec_potentially_empty_break = false;
9644 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9645 }
9646 }
9647
9648 static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond)
9649 {
9650 assert(cond.regClass() == s1);
9651
9652 append_logical_end(ctx->block);
9653 ctx->block->kind |= block_kind_uniform;
9654
9655 aco_ptr<Pseudo_branch_instruction> branch;
9656 aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
9657 branch.reset(create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 0));
9658 branch->operands[0] = Operand(cond);
9659 branch->operands[0].setFixed(scc);
9660 ctx->block->instructions.emplace_back(std::move(branch));
9661
9662 ic->BB_if_idx = ctx->block->index;
9663 ic->BB_endif = Block();
9664 ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9665 ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
9666
9667 ctx->cf_info.has_branch = false;
9668 ctx->cf_info.parent_loop.has_divergent_branch = false;
9669
9670 /** emit then block */
9671 Block* BB_then = ctx->program->create_and_insert_block();
9672 BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9673 add_edge(ic->BB_if_idx, BB_then);
9674 append_logical_start(BB_then);
9675 ctx->block = BB_then;
9676 }
9677
9678 static void begin_uniform_if_else(isel_context *ctx, if_context *ic)
9679 {
9680 Block *BB_then = ctx->block;
9681
9682 ic->uniform_has_then_branch = ctx->cf_info.has_branch;
9683 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
9684
9685 if (!ic->uniform_has_then_branch) {
9686 append_logical_end(BB_then);
9687 /* branch from then block to endif block */
9688 aco_ptr<Pseudo_branch_instruction> branch;
9689 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9690 BB_then->instructions.emplace_back(std::move(branch));
9691 add_linear_edge(BB_then->index, &ic->BB_endif);
9692 if (!ic->then_branch_divergent)
9693 add_logical_edge(BB_then->index, &ic->BB_endif);
9694 BB_then->kind |= block_kind_uniform;
9695 }
9696
9697 ctx->cf_info.has_branch = false;
9698 ctx->cf_info.parent_loop.has_divergent_branch = false;
9699
9700 /** emit else block */
9701 Block* BB_else = ctx->program->create_and_insert_block();
9702 BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9703 add_edge(ic->BB_if_idx, BB_else);
9704 append_logical_start(BB_else);
9705 ctx->block = BB_else;
9706 }
9707
9708 static void end_uniform_if(isel_context *ctx, if_context *ic)
9709 {
9710 Block *BB_else = ctx->block;
9711
9712 if (!ctx->cf_info.has_branch) {
9713 append_logical_end(BB_else);
9714 /* branch from then block to endif block */
9715 aco_ptr<Pseudo_branch_instruction> branch;
9716 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9717 BB_else->instructions.emplace_back(std::move(branch));
9718 add_linear_edge(BB_else->index, &ic->BB_endif);
9719 if (!ctx->cf_info.parent_loop.has_divergent_branch)
9720 add_logical_edge(BB_else->index, &ic->BB_endif);
9721 BB_else->kind |= block_kind_uniform;
9722 }
9723
9724 ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
9725 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
9726
9727 /** emit endif merge block */
9728 if (!ctx->cf_info.has_branch) {
9729 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
9730 append_logical_start(ctx->block);
9731 }
9732 }
9733
9734 static bool visit_if(isel_context *ctx, nir_if *if_stmt)
9735 {
9736 Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
9737 Builder bld(ctx->program, ctx->block);
9738 aco_ptr<Pseudo_branch_instruction> branch;
9739 if_context ic;
9740
9741 if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
9742 /**
9743 * Uniform conditionals are represented in the following way*) :
9744 *
9745 * The linear and logical CFG:
9746 * BB_IF
9747 * / \
9748 * BB_THEN (logical) BB_ELSE (logical)
9749 * \ /
9750 * BB_ENDIF
9751 *
9752 * *) Exceptions may be due to break and continue statements within loops
9753 * If a break/continue happens within uniform control flow, it branches
9754 * to the loop exit/entry block. Otherwise, it branches to the next
9755 * merge block.
9756 **/
9757
9758 // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
9759 assert(cond.regClass() == ctx->program->lane_mask);
9760 cond = bool_to_scalar_condition(ctx, cond);
9761
9762 begin_uniform_if_then(ctx, &ic, cond);
9763 visit_cf_list(ctx, &if_stmt->then_list);
9764
9765 begin_uniform_if_else(ctx, &ic);
9766 visit_cf_list(ctx, &if_stmt->else_list);
9767
9768 end_uniform_if(ctx, &ic);
9769 } else { /* non-uniform condition */
9770 /**
9771 * To maintain a logical and linear CFG without critical edges,
9772 * non-uniform conditionals are represented in the following way*) :
9773 *
9774 * The linear CFG:
9775 * BB_IF
9776 * / \
9777 * BB_THEN (logical) BB_THEN (linear)
9778 * \ /
9779 * BB_INVERT (linear)
9780 * / \
9781 * BB_ELSE (logical) BB_ELSE (linear)
9782 * \ /
9783 * BB_ENDIF
9784 *
9785 * The logical CFG:
9786 * BB_IF
9787 * / \
9788 * BB_THEN (logical) BB_ELSE (logical)
9789 * \ /
9790 * BB_ENDIF
9791 *
9792 * *) Exceptions may be due to break and continue statements within loops
9793 **/
9794
9795 begin_divergent_if_then(ctx, &ic, cond);
9796 visit_cf_list(ctx, &if_stmt->then_list);
9797
9798 begin_divergent_if_else(ctx, &ic);
9799 visit_cf_list(ctx, &if_stmt->else_list);
9800
9801 end_divergent_if(ctx, &ic);
9802 }
9803
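   /* Tell the caller whether the code after this if is still reachable:
    * has_branch is set when a uniform branch was taken on both sides, and the
    * merge block has no logical predecessors when both divergent legs ended in
    * a break/continue. */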
9804 return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
9805 }
9806
9807 static bool visit_cf_list(isel_context *ctx,
9808 struct exec_list *list)
9809 {
9810 foreach_list_typed(nir_cf_node, node, node, list) {
9811 switch (node->type) {
9812 case nir_cf_node_block:
9813 visit_block(ctx, nir_cf_node_as_block(node));
9814 break;
9815 case nir_cf_node_if:
9816 if (!visit_if(ctx, nir_cf_node_as_if(node)))
9817 return true;
9818 break;
9819 case nir_cf_node_loop:
9820 visit_loop(ctx, nir_cf_node_as_loop(node));
9821 break;
9822 default:
9823 unreachable("unimplemented cf list type");
9824 }
9825 }
9826 return false;
9827 }
9828
9829 static void create_null_export(isel_context *ctx)
9830 {
9831 /* Some shader stages always need to have exports.
9832 * So when there is none, we need to add a null export.
9833 */
9834
9835 unsigned dest = (ctx->program->stage & hw_fs) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
9836 bool vm = (ctx->program->stage & hw_fs) || ctx->program->chip_class >= GFX10;
9837 Builder bld(ctx->program, ctx->block);
9838 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
9839 /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, vm);
9840 }
9841
9842 static bool export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
9843 {
9844 assert(ctx->stage == vertex_vs ||
9845 ctx->stage == tess_eval_vs ||
9846 ctx->stage == gs_copy_vs ||
9847 ctx->stage == ngg_vertex_gs ||
9848 ctx->stage == ngg_tess_eval_gs);
9849
9850 int offset = (ctx->stage & sw_tes)
9851 ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
9852 : ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
9853 uint64_t mask = ctx->outputs.mask[slot];
9854 if (!is_pos && !mask)
9855 return false;
9856 if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
9857 return false;
9858 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
9859 exp->enabled_mask = mask;
9860 for (unsigned i = 0; i < 4; ++i) {
9861 if (mask & (1 << i))
9862 exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
9863 else
9864 exp->operands[i] = Operand(v1);
9865 }
9866 /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
9867 * Setting valid_mask=1 prevents it and has no other effect.
9868 */
9869 exp->valid_mask = ctx->options->chip_class >= GFX10 && is_pos && *next_pos == 0;
9870 exp->done = false;
9871 exp->compressed = false;
9872 if (is_pos)
9873 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
9874 else
9875 exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
9876 ctx->block->instructions.emplace_back(std::move(exp));
9877
9878 return true;
9879 }
9880
9881 static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
9882 {
9883 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
9884 exp->enabled_mask = 0;
9885 for (unsigned i = 0; i < 4; ++i)
9886 exp->operands[i] = Operand(v1);
9887 if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {
9888 exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]);
9889 exp->enabled_mask |= 0x1;
9890 }
9891 if (ctx->outputs.mask[VARYING_SLOT_LAYER]) {
9892 exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]);
9893 exp->enabled_mask |= 0x4;
9894 }
9895 if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) {
9896 if (ctx->options->chip_class < GFX9) {
9897 exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]);
9898 exp->enabled_mask |= 0x8;
9899 } else {
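         /* On GFX9+, the viewport index is exported in the upper 16 bits of the
          * same channel as the layer, so shift it up and OR it into operand 2. */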
9900 Builder bld(ctx->program, ctx->block);
9901
9902 Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
9903 Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]));
9904 if (exp->operands[2].isTemp())
9905 out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
9906
9907 exp->operands[2] = Operand(out);
9908 exp->enabled_mask |= 0x4;
9909 }
9910 }
9911 exp->valid_mask = ctx->options->chip_class >= GFX10 && *next_pos == 0;
9912 exp->done = false;
9913 exp->compressed = false;
9914 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
9915 ctx->block->instructions.emplace_back(std::move(exp));
9916 }
9917
9918 static void create_export_phis(isel_context *ctx)
9919 {
9920 /* Used when exports are needed, but the output temps are defined in a preceding block.
9921 * This function will set up phis in order to access the outputs in the next block.
9922 */
9923
9924 assert(ctx->block->instructions.back()->opcode == aco_opcode::p_logical_start);
9925 aco_ptr<Instruction> logical_start = aco_ptr<Instruction>(ctx->block->instructions.back().release());
9926 ctx->block->instructions.pop_back();
9927
9928 Builder bld(ctx->program, ctx->block);
9929
9930 for (unsigned slot = 0; slot <= VARYING_SLOT_VAR31; ++slot) {
9931 uint64_t mask = ctx->outputs.mask[slot];
9932 for (unsigned i = 0; i < 4; ++i) {
9933 if (!(mask & (1 << i)))
9934 continue;
9935
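         /* Turn each written output into a phi of its temp from the preceding
          * block and an undefined operand for the other incoming edge. */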
9936 Temp old = ctx->outputs.temps[slot * 4 + i];
9937 Temp phi = bld.pseudo(aco_opcode::p_phi, bld.def(v1), old, Operand(v1));
9938 ctx->outputs.temps[slot * 4 + i] = phi;
9939 }
9940 }
9941
9942 bld.insert(std::move(logical_start));
9943 }
9944
9945 static void create_vs_exports(isel_context *ctx)
9946 {
9947 assert(ctx->stage == vertex_vs ||
9948 ctx->stage == tess_eval_vs ||
9949 ctx->stage == gs_copy_vs ||
9950 ctx->stage == ngg_vertex_gs ||
9951 ctx->stage == ngg_tess_eval_gs);
9952
9953 radv_vs_output_info *outinfo = (ctx->stage & sw_tes)
9954 ? &ctx->program->info->tes.outinfo
9955 : &ctx->program->info->vs.outinfo;
9956
9957 if (outinfo->export_prim_id && !(ctx->stage & hw_ngg_gs)) {
9958 ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
9959 ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->vs_prim_id);
9960 }
9961
9962 if (ctx->options->key.has_multiview_view_index) {
9963 ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1;
9964 ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
9965 }
9966
9967 /* the order these position exports are created is important */
9968 int next_pos = 0;
9969 bool exported_pos = export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
9970 if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
9971 export_vs_psiz_layer_viewport(ctx, &next_pos);
9972 exported_pos = true;
9973 }
9974 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
9975 exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
9976 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
9977 exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
9978
9979 if (ctx->export_clip_dists) {
9980 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
9981 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
9982 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
9983 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
9984 }
9985
9986 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
9987 if (i < VARYING_SLOT_VAR0 &&
9988 i != VARYING_SLOT_LAYER &&
9989 i != VARYING_SLOT_PRIMITIVE_ID &&
9990 i != VARYING_SLOT_VIEWPORT)
9991 continue;
9992
9993 export_vs_varying(ctx, i, false, NULL);
9994 }
9995
9996 if (!exported_pos)
9997 create_null_export(ctx);
9998 }
9999
10000 static bool export_fs_mrt_z(isel_context *ctx)
10001 {
10002 Builder bld(ctx->program, ctx->block);
10003 unsigned enabled_channels = 0;
10004 bool compr = false;
10005 Operand values[4];
10006
10007 for (unsigned i = 0; i < 4; ++i) {
10008 values[i] = Operand(v1);
10009 }
10010
10011    /* Both stencil and sample mask only need 16 bits. */
10012 if (!ctx->program->info->ps.writes_z &&
10013 (ctx->program->info->ps.writes_stencil ||
10014 ctx->program->info->ps.writes_sample_mask)) {
10015 compr = true; /* COMPR flag */
10016
10017 if (ctx->program->info->ps.writes_stencil) {
10018 /* Stencil should be in X[23:16]. */
10019 values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10020 values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), values[0]);
10021 enabled_channels |= 0x3;
10022 }
10023
10024 if (ctx->program->info->ps.writes_sample_mask) {
10025 /* SampleMask should be in Y[15:0]. */
10026 values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10027 enabled_channels |= 0xc;
10028 }
10029 } else {
10030 if (ctx->program->info->ps.writes_z) {
10031 values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]);
10032 enabled_channels |= 0x1;
10033 }
10034
10035 if (ctx->program->info->ps.writes_stencil) {
10036 values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10037 enabled_channels |= 0x2;
10038 }
10039
10040 if (ctx->program->info->ps.writes_sample_mask) {
10041 values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10042 enabled_channels |= 0x4;
10043 }
10044 }
10045
10046    /* GFX6 (except OLAND and HAINAN) has a bug where it only looks at the X
10047     * writemask component.
10048     */
10049 if (ctx->options->chip_class == GFX6 &&
10050 ctx->options->family != CHIP_OLAND &&
10051 ctx->options->family != CHIP_HAINAN) {
10052 enabled_channels |= 0x1;
10053 }
10054
10055 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
10056 enabled_channels, V_008DFC_SQ_EXP_MRTZ, compr);
10057
10058 return true;
10059 }
10060
10061 static bool export_fs_mrt_color(isel_context *ctx, int slot)
10062 {
10063 Builder bld(ctx->program, ctx->block);
10064 unsigned write_mask = ctx->outputs.mask[slot];
10065 Operand values[4];
10066
10067 for (unsigned i = 0; i < 4; ++i) {
10068 if (write_mask & (1 << i)) {
10069 values[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10070 } else {
10071 values[i] = Operand(v1);
10072 }
10073 }
10074
10075 unsigned target, col_format;
10076 unsigned enabled_channels = 0;
10077 aco_opcode compr_op = (aco_opcode)0;
10078
10079 slot -= FRAG_RESULT_DATA0;
10080 target = V_008DFC_SQ_EXP_MRT + slot;
10081 col_format = (ctx->options->key.fs.col_format >> (4 * slot)) & 0xf;
10082
10083 bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1;
10084 bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1;
10085 bool is_16bit = values[0].regClass() == v2b;
10086
10087 switch (col_format)
10088 {
10089 case V_028714_SPI_SHADER_ZERO:
10090 enabled_channels = 0; /* writemask */
10091 target = V_008DFC_SQ_EXP_NULL;
10092 break;
10093
10094 case V_028714_SPI_SHADER_32_R:
10095 enabled_channels = 1;
10096 break;
10097
10098 case V_028714_SPI_SHADER_32_GR:
10099 enabled_channels = 0x3;
10100 break;
10101
10102 case V_028714_SPI_SHADER_32_AR:
10103 if (ctx->options->chip_class >= GFX10) {
10104 /* Special case: on GFX10, the outputs are different for 32_AR */
10105 enabled_channels = 0x3;
10106 values[1] = values[3];
10107 values[3] = Operand(v1);
10108 } else {
10109 enabled_channels = 0x9;
10110 }
10111 break;
10112
10113 case V_028714_SPI_SHADER_FP16_ABGR:
10114 enabled_channels = 0x5;
10115 compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
10116 if (is_16bit) {
10117 if (ctx->options->chip_class >= GFX9) {
10118 /* Pack the FP16 values together instead of converting them to
10119 * FP32 and back to FP16.
10120             * TODO: use p_create_vector and let the compiler optimize it.
10121 */
10122 compr_op = aco_opcode::v_pack_b32_f16;
10123 } else {
10124 for (unsigned i = 0; i < 4; i++) {
10125 if ((write_mask >> i) & 1)
10126 values[i] = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), values[i]);
10127 }
10128 }
10129 }
10130 break;
10131
10132 case V_028714_SPI_SHADER_UNORM16_ABGR:
10133 enabled_channels = 0x5;
10134 if (is_16bit && ctx->options->chip_class >= GFX9) {
10135 compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
10136 } else {
10137 compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
10138 }
10139 break;
10140
10141 case V_028714_SPI_SHADER_SNORM16_ABGR:
10142 enabled_channels = 0x5;
10143 if (is_16bit && ctx->options->chip_class >= GFX9) {
10144 compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
10145 } else {
10146 compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
10147 }
10148 break;
10149
10150 case V_028714_SPI_SHADER_UINT16_ABGR: {
10151 enabled_channels = 0x5;
10152 compr_op = aco_opcode::v_cvt_pk_u16_u32;
10153 if (is_int8 || is_int10) {
10154 /* clamp */
10155 uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
10156 Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
10157
10158 for (unsigned i = 0; i < 4; i++) {
10159 if ((write_mask >> i) & 1) {
10160 values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
10161 i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val),
10162 values[i]);
10163 }
10164 }
10165 } else if (is_16bit) {
10166 for (unsigned i = 0; i < 4; i++) {
10167 if ((write_mask >> i) & 1) {
10168 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
10169 values[i] = Operand(tmp);
10170 }
10171 }
10172 }
10173 break;
10174 }
10175
10176 case V_028714_SPI_SHADER_SINT16_ABGR:
10177 enabled_channels = 0x5;
10178 compr_op = aco_opcode::v_cvt_pk_i16_i32;
10179 if (is_int8 || is_int10) {
10180 /* clamp */
10181 uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
10182          uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0;
10183 Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
10184 Temp min_rgb_val = bld.copy(bld.def(s1), Operand(min_rgb));
10185
10186 for (unsigned i = 0; i < 4; i++) {
10187 if ((write_mask >> i) & 1) {
10188 values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
10189 i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val),
10190 values[i]);
10191 values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
10192 i == 3 && is_int10 ? Operand(-2u) : Operand(min_rgb_val),
10193 values[i]);
10194 }
10195 }
10196 } else if (is_16bit) {
10197 for (unsigned i = 0; i < 4; i++) {
10198 if ((write_mask >> i) & 1) {
10199 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
10200 values[i] = Operand(tmp);
10201 }
10202 }
10203 }
10204 break;
10205
10206 case V_028714_SPI_SHADER_32_ABGR:
10207 enabled_channels = 0xF;
10208 break;
10209
10210 default:
10211 break;
10212 }
10213
10214 if (target == V_008DFC_SQ_EXP_NULL)
10215 return false;
10216
10217 /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10218 if (ctx->options->enable_mrt_output_nan_fixup &&
10219 !is_16bit &&
10220 (col_format == V_028714_SPI_SHADER_32_R ||
10221 col_format == V_028714_SPI_SHADER_32_GR ||
10222 col_format == V_028714_SPI_SHADER_32_AR ||
10223 col_format == V_028714_SPI_SHADER_32_ABGR ||
10224 col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10225 for (int i = 0; i < 4; i++) {
10226 if (!(write_mask & (1 << i)))
10227 continue;
10228
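         /* v_cmp_class with mask 0x3 tests for signaling and quiet NaNs. */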
10229 Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32,
10230 bld.hint_vcc(bld.def(bld.lm)), values[i],
10231 bld.copy(bld.def(v1), Operand(3u)));
10232 values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
10233 bld.copy(bld.def(v1), Operand(0u)), isnan);
10234 }
10235 }
10236
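   /* Compressed exports pack two 16-bit results per dword: channels 0/1 go into
    * the first dword and channels 2/3 into the second, with undefined inputs
    * replaced by zero. */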
10237 if ((bool) compr_op) {
10238 for (int i = 0; i < 2; i++) {
10239 /* check if at least one of the values to be compressed is enabled */
10240 unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
10241 if (enabled) {
10242 enabled_channels |= enabled << (i*2);
10243 values[i] = bld.vop3(compr_op, bld.def(v1),
10244 values[i*2].isUndefined() ? Operand(0u) : values[i*2],
10245                               values[i*2+1].isUndefined() ? Operand(0u) : values[i*2+1]);
10246 } else {
10247 values[i] = Operand(v1);
10248 }
10249 }
10250 values[2] = Operand(v1);
10251 values[3] = Operand(v1);
10252 } else {
10253 for (int i = 0; i < 4; i++)
10254 values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
10255 }
10256
10257 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
10258 enabled_channels, target, (bool) compr_op);
10259 return true;
10260 }
10261
10262 static void create_fs_exports(isel_context *ctx)
10263 {
10264 bool exported = false;
10265
10266 /* Export depth, stencil and sample mask. */
10267 if (ctx->outputs.mask[FRAG_RESULT_DEPTH] ||
10268 ctx->outputs.mask[FRAG_RESULT_STENCIL] ||
10269 ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
10270 exported |= export_fs_mrt_z(ctx);
10271
10272 /* Export all color render targets. */
10273 for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i)
10274 if (ctx->outputs.mask[i])
10275 exported |= export_fs_mrt_color(ctx, i);
10276
10277 if (!exported)
10278 create_null_export(ctx);
10279 }
10280
10281 static void write_tcs_tess_factors(isel_context *ctx)
10282 {
10283 unsigned outer_comps;
10284 unsigned inner_comps;
10285
10286 switch (ctx->args->options->key.tcs.primitive_mode) {
10287 case GL_ISOLINES:
10288 outer_comps = 2;
10289 inner_comps = 0;
10290 break;
10291 case GL_TRIANGLES:
10292 outer_comps = 3;
10293 inner_comps = 1;
10294 break;
10295 case GL_QUADS:
10296 outer_comps = 4;
10297 inner_comps = 2;
10298 break;
10299 default:
10300 return;
10301 }
10302
10303 Builder bld(ctx->program, ctx->block);
10304
10305 bld.barrier(aco_opcode::p_memory_barrier_shared);
10306 if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
10307 bld.sopp(aco_opcode::s_barrier);
10308
10309 Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
10310 Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u));
10311
10312 Temp invocation_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), invocation_id);
10313 if_context ic_invocation_id_is_zero;
10314 begin_divergent_if_then(ctx, &ic_invocation_id_is_zero, invocation_id_is_zero);
10315 bld.reset(ctx->block);
10316
10317 Temp hs_ring_tess_factor = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_FACTOR * 16u));
10318
10319 std::pair<Temp, unsigned> lds_base = get_tcs_output_lds_offset(ctx);
10320 unsigned stride = inner_comps + outer_comps;
10321 unsigned lds_align = calculate_lds_alignment(ctx, lds_base.second);
10322 Temp tf_inner_vec;
10323 Temp tf_outer_vec;
10324 Temp out[6];
10325 assert(stride <= (sizeof(out) / sizeof(Temp)));
10326
10327 if (ctx->args->options->key.tcs.primitive_mode == GL_ISOLINES) {
10328       // isolines: the two outer tess factors are swapped (LINES reversal)
10329 tf_outer_vec = load_lds(ctx, 4, bld.tmp(v2), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_out_loc, lds_align);
10330 out[1] = emit_extract_vector(ctx, tf_outer_vec, 0, v1);
10331 out[0] = emit_extract_vector(ctx, tf_outer_vec, 1, v1);
10332 } else {
10333 tf_outer_vec = load_lds(ctx, 4, bld.tmp(RegClass(RegType::vgpr, outer_comps)), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_out_loc, lds_align);
10334 tf_inner_vec = load_lds(ctx, 4, bld.tmp(RegClass(RegType::vgpr, inner_comps)), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_in_loc, lds_align);
10335
10336 for (unsigned i = 0; i < outer_comps; ++i)
10337 out[i] = emit_extract_vector(ctx, tf_outer_vec, i, v1);
10338 for (unsigned i = 0; i < inner_comps; ++i)
10339 out[outer_comps + i] = emit_extract_vector(ctx, tf_inner_vec, i, v1);
10340 }
10341
10342 Temp rel_patch_id = get_tess_rel_patch_id(ctx);
10343 Temp tf_base = get_arg(ctx, ctx->args->tess_factor_offset);
10344 Temp byte_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, stride * 4u);
10345 unsigned tf_const_offset = 0;
10346
10347 if (ctx->program->chip_class <= GFX8) {
10348 Temp rel_patch_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), rel_patch_id);
10349 if_context ic_rel_patch_id_is_zero;
10350 begin_divergent_if_then(ctx, &ic_rel_patch_id_is_zero, rel_patch_id_is_zero);
10351 bld.reset(ctx->block);
10352
10353 /* Store the dynamic HS control word. */
10354 Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
10355 bld.mubuf(aco_opcode::buffer_store_dword,
10356 /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
10357 /* immediate OFFSET */ 0, /* OFFEN */ false, /* swizzled */ false, /* idxen*/ false,
10358 /* addr64 */ false, /* disable_wqm */ false, /* glc */ true);
10359 tf_const_offset += 4;
10360
10361 begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);
10362 end_divergent_if(ctx, &ic_rel_patch_id_is_zero);
10363 bld.reset(ctx->block);
10364 }
10365
10366 assert(stride == 2 || stride == 4 || stride == 6);
10367 Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr, 4u);
10368 store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false);
10369
10370 /* Store to offchip for TES to read - only if TES reads them */
10371 if (ctx->args->options->key.tcs.tes_reads_tess_factors) {
10372 Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
10373 Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
10374
10375 std::pair<Temp, unsigned> vmem_offs_outer = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_out_loc);
10376 store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, false);
10377
10378 if (likely(inner_comps)) {
10379 std::pair<Temp, unsigned> vmem_offs_inner = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_in_loc);
10380 store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, false);
10381 }
10382 }
10383
10384 begin_divergent_if_else(ctx, &ic_invocation_id_is_zero);
10385 end_divergent_if(ctx, &ic_invocation_id_is_zero);
10386 }
10387
10388 static void emit_stream_output(isel_context *ctx,
10389 Temp const *so_buffers,
10390 Temp const *so_write_offset,
10391 const struct radv_stream_output *output)
10392 {
10393 unsigned num_comps = util_bitcount(output->component_mask);
10394 unsigned writemask = (1 << num_comps) - 1;
10395 unsigned loc = output->location;
10396 unsigned buf = output->buffer;
10397
10398 assert(num_comps && num_comps <= 4);
10399 if (!num_comps || num_comps > 4)
10400 return;
10401
10402 unsigned start = ffs(output->component_mask) - 1;
10403
10404 Temp out[4];
10405 bool all_undef = true;
10406 assert(ctx->stage & hw_vs);
10407 for (unsigned i = 0; i < num_comps; i++) {
10408 out[i] = ctx->outputs.temps[loc * 4 + start + i];
10409 all_undef = all_undef && !out[i].id();
10410 }
10411 if (all_undef)
10412 return;
10413
10414 while (writemask) {
10415 int start, count;
10416 u_bit_scan_consecutive_range(&writemask, &start, &count);
10417 if (count == 3 && ctx->options->chip_class == GFX6) {
10418 /* GFX6 doesn't support storing vec3, split it. */
10419 writemask |= 1u << (start + 2);
10420 count = 2;
10421 }
10422
10423 unsigned offset = output->offset + start * 4;
10424
10425 Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, count)};
10426 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
10427 for (int i = 0; i < count; ++i)
10428 vec->operands[i] = (ctx->outputs.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand(0u);
10429 vec->definitions[0] = Definition(write_data);
10430 ctx->block->instructions.emplace_back(std::move(vec));
10431
10432 aco_opcode opcode;
10433 switch (count) {
10434 case 1:
10435 opcode = aco_opcode::buffer_store_dword;
10436 break;
10437 case 2:
10438 opcode = aco_opcode::buffer_store_dwordx2;
10439 break;
10440 case 3:
10441 opcode = aco_opcode::buffer_store_dwordx3;
10442 break;
10443 case 4:
10444 opcode = aco_opcode::buffer_store_dwordx4;
10445 break;
10446 default:
10447 unreachable("Unsupported dword count.");
10448 }
10449
10450 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
10451 store->operands[0] = Operand(so_buffers[buf]);
10452 store->operands[1] = Operand(so_write_offset[buf]);
10453 store->operands[2] = Operand((uint32_t) 0);
10454 store->operands[3] = Operand(write_data);
10455 if (offset > 4095) {
10456          /* This probably can't happen in RADV, but maybe in GL? It's easy to handle anyway. */
10457 Builder bld(ctx->program, ctx->block);
10458 store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
10459 } else {
10460 store->offset = offset;
10461 }
10462 store->offen = true;
10463 store->glc = true;
10464 store->dlc = false;
10465 store->slc = true;
10466 store->can_reorder = true;
10467 ctx->block->instructions.emplace_back(std::move(store));
10468 }
10469 }
10470
10471 static void emit_streamout(isel_context *ctx, unsigned stream)
10472 {
10473 Builder bld(ctx->program, ctx->block);
10474
10475 Temp so_buffers[4];
10476 Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
10477 for (unsigned i = 0; i < 4; i++) {
10478 unsigned stride = ctx->program->info->so.strides[i];
10479 if (!stride)
10480 continue;
10481
10482 Operand off = bld.copy(bld.def(s1), Operand(i * 16u));
10483 so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, off);
10484 }
10485
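   /* s_bfe simm16 0x70010 = (width 7) << 16 | (offset 16): extract the streamout
    * vertex count from bits [22:16] of streamout_config. */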
10486 Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10487 get_arg(ctx, ctx->args->streamout_config), Operand(0x70010u));
10488
10489 Temp tid = emit_mbcnt(ctx, bld.def(v1));
10490
10491 Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
10492
10493 if_context ic;
10494 begin_divergent_if_then(ctx, &ic, can_emit);
10495
10496 bld.reset(ctx->block);
10497
10498 Temp so_write_index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->streamout_write_idx), tid);
10499
10500 Temp so_write_offset[4];
10501
10502 for (unsigned i = 0; i < 4; i++) {
10503 unsigned stride = ctx->program->info->so.strides[i];
10504 if (!stride)
10505 continue;
10506
10507 if (stride == 1) {
10508 Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
10509 get_arg(ctx, ctx->args->streamout_write_idx),
10510 get_arg(ctx, ctx->args->streamout_offset[i]));
10511 Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
10512
10513 so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
10514 } else {
10515 Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
10516 Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u),
10517 get_arg(ctx, ctx->args->streamout_offset[i]));
10518 so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
10519 }
10520 }
10521
10522 for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
10523 struct radv_stream_output *output =
10524 &ctx->program->info->so.outputs[i];
10525 if (stream != output->stream)
10526 continue;
10527
10528 emit_stream_output(ctx, so_buffers, so_write_offset, output);
10529 }
10530
10531 begin_divergent_if_else(ctx, &ic);
10532 end_divergent_if(ctx, &ic);
10533 }
10534
10535 } /* end namespace */
10536
10537 void fix_ls_vgpr_init_bug(isel_context *ctx, Pseudo_instruction *startpgm)
10538 {
10539 assert(ctx->shader->info.stage == MESA_SHADER_VERTEX);
10540 Builder bld(ctx->program, ctx->block);
10541 constexpr unsigned hs_idx = 1u;
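   /* The s_bfe simm16 encodes (field width << 16) | offset, so this extracts the
    * 8-bit HS thread count stored at bit offset hs_idx * 8 of merged_wave_info. */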
10542 Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10543 get_arg(ctx, ctx->args->merged_wave_info),
10544 Operand((8u << 16) | (hs_idx * 8u)));
10545 Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
10546
10547 /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
10548
10549 Temp instance_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10550 get_arg(ctx, ctx->args->rel_auto_id),
10551 get_arg(ctx, ctx->args->ac.instance_id),
10552 ls_has_nonzero_hs_threads);
10553 Temp rel_auto_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10554 get_arg(ctx, ctx->args->ac.tcs_rel_ids),
10555 get_arg(ctx, ctx->args->rel_auto_id),
10556 ls_has_nonzero_hs_threads);
10557 Temp vertex_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10558 get_arg(ctx, ctx->args->ac.tcs_patch_id),
10559 get_arg(ctx, ctx->args->ac.vertex_id),
10560 ls_has_nonzero_hs_threads);
10561
10562 ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id;
10563 ctx->arg_temps[ctx->args->rel_auto_id.arg_index] = rel_auto_id;
10564 ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id;
10565 }
10566
10567 void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm)
10568 {
10569 /* Split all arguments except for the first (ring_offsets) and the last
10570 * (exec) so that the dead channels don't stay live throughout the program.
10571 */
10572 for (int i = 1; i < startpgm->definitions.size() - 1; i++) {
10573 if (startpgm->definitions[i].regClass().size() > 1) {
10574 emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
10575 startpgm->definitions[i].regClass().size());
10576 }
10577 }
10578 }
10579
10580 void handle_bc_optimize(isel_context *ctx)
10581 {
10582 /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
10583 Builder bld(ctx->program, ctx->block);
10584 uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
10585 bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
10586 bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
10587 ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
10588 ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
10589 if (uses_center && uses_centroid) {
10590 Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),
10591 get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u));
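      /* With BC_OPTIMIZE enabled, the hardware sets the sign bit of prim_mask to
       * indicate that centroid interpolation was skipped, so substitute the center
       * coordinates in that case. */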
10592
10593 if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
10594 Temp new_coord[2];
10595 for (unsigned i = 0; i < 2; i++) {
10596 Temp persp_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
10597 Temp persp_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
10598 new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10599 persp_centroid, persp_center, sel);
10600 }
10601 ctx->persp_centroid = bld.tmp(v2);
10602 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
10603 Operand(new_coord[0]), Operand(new_coord[1]));
10604 emit_split_vector(ctx, ctx->persp_centroid, 2);
10605 }
10606
10607 if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
10608 Temp new_coord[2];
10609 for (unsigned i = 0; i < 2; i++) {
10610 Temp linear_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
10611 Temp linear_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
10612 new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10613 linear_centroid, linear_center, sel);
10614 }
10615 ctx->linear_centroid = bld.tmp(v2);
10616 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
10617 Operand(new_coord[0]), Operand(new_coord[1]));
10618 emit_split_vector(ctx, ctx->linear_centroid, 2);
10619 }
10620 }
10621 }
10622
10623 void setup_fp_mode(isel_context *ctx, nir_shader *shader)
10624 {
10625 Program *program = ctx->program;
10626
10627 unsigned float_controls = shader->info.float_controls_execution_mode;
10628
10629 program->next_fp_mode.preserve_signed_zero_inf_nan32 =
10630 float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
10631 program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
10632 float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
10633 FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
10634
10635 program->next_fp_mode.must_flush_denorms32 =
10636 float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
10637 program->next_fp_mode.must_flush_denorms16_64 =
10638 float_controls & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
10639 FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
10640
10641 program->next_fp_mode.care_about_round32 =
10642 float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
10643
10644 program->next_fp_mode.care_about_round16_64 =
10645 float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
10646 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
10647
10648 /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
10649 * the precision seems needed for Wolfenstein: Youngblood to render correctly */
10650 if (program->next_fp_mode.must_flush_denorms16_64)
10651 program->next_fp_mode.denorm16_64 = 0;
10652 else
10653 program->next_fp_mode.denorm16_64 = fp_denorm_keep;
10654
10655 /* preserving fp32 denorms is expensive, so only do it if asked */
10656 if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
10657 program->next_fp_mode.denorm32 = fp_denorm_keep;
10658 else
10659 program->next_fp_mode.denorm32 = 0;
10660
10661 if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
10662 program->next_fp_mode.round32 = fp_round_tz;
10663 else
10664 program->next_fp_mode.round32 = fp_round_ne;
10665
10666 if (float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
10667 program->next_fp_mode.round16_64 = fp_round_tz;
10668 else
10669 program->next_fp_mode.round16_64 = fp_round_ne;
10670
10671 ctx->block->fp_mode = program->next_fp_mode;
10672 }
10673
10674 void cleanup_cfg(Program *program)
10675 {
10676 /* create linear_succs/logical_succs */
10677 for (Block& BB : program->blocks) {
10678 for (unsigned idx : BB.linear_preds)
10679 program->blocks[idx].linear_succs.emplace_back(BB.index);
10680 for (unsigned idx : BB.logical_preds)
10681 program->blocks[idx].logical_succs.emplace_back(BB.index);
10682 }
10683 }
10684
10685 Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i)
10686 {
10687 Builder bld(ctx->program, ctx->block);
10688
10689    /* The s_bfm only cares about s0.u[5:0], so we need neither s_bfe nor s_and here */
10690 Temp count = i == 0
10691 ? get_arg(ctx, ctx->args->merged_wave_info)
10692 : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
10693 get_arg(ctx, ctx->args->merged_wave_info), Operand(i * 8u));
10694
10695 Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand(0u));
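   /* s_bfm_b64 computes ((1ull << count[5:0]) - 1) << 0, e.g. count = 13 yields a
    * mask with lanes 0..12 set. */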
10696 Temp cond;
10697
10698 if (ctx->program->wave_size == 64) {
10699 /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
10700 Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count, Operand(6u /* log2(64) */));
10701 cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), mask, bld.scc(active_64));
10702 } else {
10703       /* We use s_bfm_b64 (not _b32) because it handles a count of 32 correctly, but then we need to extract the lower half of the register */
10704 cond = emit_extract_vector(ctx, mask, 0, bld.lm);
10705 }
10706
10707 return cond;
10708 }
10709
10710 bool ngg_early_prim_export(isel_context *ctx)
10711 {
10712 /* TODO: Check edge flags, and if they are written, return false. (Needed for OpenGL, not for Vulkan.) */
10713 return true;
10714 }
10715
10716 void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx)
10717 {
10718 Builder bld(ctx->program, ctx->block);
10719
10720 /* It is recommended to do the GS_ALLOC_REQ as soon and as quickly as possible, so we set the maximum priority (3). */
10721 bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
10722
10723 /* Get the id of the current wave within the threadgroup (workgroup) */
10724 Builder::Result wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10725 get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16)));
10726
10727    /* Execute the following code only on the first wave (wave id 0);
10728     * use the SCC def to tell whether the wave id is zero or not.
10729     */
10730 Temp cond = wave_id_in_tg.def(1).getTemp();
10731 if_context ic;
10732 begin_uniform_if_then(ctx, &ic, cond);
10733 begin_uniform_if_else(ctx, &ic);
10734 bld.reset(ctx->block);
10735
10736 /* Number of vertices output by VS/TES */
10737 Temp vtx_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10738 get_arg(ctx, ctx->args->gs_tg_info), Operand(12u | (9u << 16u)));
10739 /* Number of primitives output by VS/TES */
10740 Temp prm_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10741 get_arg(ctx, ctx->args->gs_tg_info), Operand(22u | (9u << 16u)));
10742
10743 /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
10744 Temp tmp = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand(12u));
10745 tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);
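   /* m0 now holds the vertex count in its low 12 bits and the primitive count
    * starting at bit 12, which is the layout GS_ALLOC_REQ consumes. */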
10746
10747 /* Request the SPI to allocate space for the primitives and vertices that will be exported by the threadgroup. */
10748 bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);
10749
10750 end_uniform_if(ctx, &ic);
10751
10752 /* After the GS_ALLOC_REQ is done, reset priority to default (0). */
10753 bld.reset(ctx->block);
10754 bld.sopp(aco_opcode::s_setprio, -1u, 0x0u);
10755 }
10756
10757 Temp ngg_get_prim_exp_arg(isel_context *ctx, unsigned num_vertices, const Temp vtxindex[])
10758 {
10759 Builder bld(ctx->program, ctx->block);
10760
10761 if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) {
10762 return get_arg(ctx, ctx->args->gs_vtx_offset[0]);
10763 }
10764
10765 Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
10766 Temp tmp;
10767
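   /* Pack the vertex indices 10 bits apart; the edge flag for vertex i (VS only)
    * goes into bit 10 * i + 9. */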
10768 for (unsigned i = 0; i < num_vertices; ++i) {
10769 assert(vtxindex[i].id());
10770
10771 if (i)
10772 tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), vtxindex[i], Operand(10u * i), tmp);
10773 else
10774 tmp = vtxindex[i];
10775
10776 /* The initial edge flag is always false in tess eval shaders. */
10777 if (ctx->stage == ngg_vertex_gs) {
10778 Temp edgeflag = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), gs_invocation_id, Operand(8 + i), Operand(1u));
10779 tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), edgeflag, Operand(10u * i + 9u), tmp);
10780 }
10781 }
10782
10783 /* TODO: Set isnull field in case of merged NGG VS+GS. */
10784
10785 return tmp;
10786 }
10787
10788 void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive, const Temp vtxindex[])
10789 {
10790 Builder bld(ctx->program, ctx->block);
10791 Temp prim_exp_arg = ngg_get_prim_exp_arg(ctx, num_vertices_per_primitive, vtxindex);
10792
10793 bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
10794 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */,
10795 false /* compressed */, true/* done */, false /* valid mask */);
10796 }
10797
10798 void ngg_emit_nogs_gsthreads(isel_context *ctx)
10799 {
10800 /* Emit the things that NGG GS threads need to do, for shaders that don't have SW GS.
10801 * These must always come before VS exports.
10802 *
10803 * It is recommended to do these as early as possible. They can be at the beginning when
10804 * there is no SW GS and the shader doesn't write edge flags.
10805 */
10806
10807 if_context ic;
10808 Temp is_gs_thread = merged_wave_info_to_mask(ctx, 1);
10809 begin_divergent_if_then(ctx, &ic, is_gs_thread);
10810
10811 Builder bld(ctx->program, ctx->block);
10812 constexpr unsigned max_vertices_per_primitive = 3;
10813 unsigned num_vertices_per_primitive = max_vertices_per_primitive;
10814
10815 if (ctx->stage == ngg_vertex_gs) {
10816 /* TODO: optimize for points & lines */
10817 } else if (ctx->stage == ngg_tess_eval_gs) {
10818 if (ctx->shader->info.tess.point_mode)
10819 num_vertices_per_primitive = 1;
10820 else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)
10821 num_vertices_per_primitive = 2;
10822 } else {
10823 unreachable("Unsupported NGG shader stage");
10824 }
10825
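   /* gs_vtx_offset[0] packs the indices of vertices 0 and 1 into its low and high
    * halves; the index of vertex 2 comes from the low half of gs_vtx_offset[2]. */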
10826 Temp vtxindex[max_vertices_per_primitive];
10827 vtxindex[0] = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
10828 get_arg(ctx, ctx->args->gs_vtx_offset[0]));
10829 vtxindex[1] = num_vertices_per_primitive < 2 ? Temp(0, v1) :
10830 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
10831 get_arg(ctx, ctx->args->gs_vtx_offset[0]), Operand(16u), Operand(16u));
10832 vtxindex[2] = num_vertices_per_primitive < 3 ? Temp(0, v1) :
10833 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
10834 get_arg(ctx, ctx->args->gs_vtx_offset[2]));
10835
10836 /* Export primitive data to the index buffer. */
10837 ngg_emit_prim_export(ctx, num_vertices_per_primitive, vtxindex);
10838
10839 /* Export primitive ID. */
10840 if (ctx->stage == ngg_vertex_gs && ctx->args->options->key.vs_common_out.export_prim_id) {
10841 /* Copy Primitive IDs from GS threads to the LDS address corresponding to the ES thread of the provoking vertex. */
10842 Temp prim_id = get_arg(ctx, ctx->args->ac.gs_prim_id);
10843 Temp provoking_vtx_index = vtxindex[0];
10844 Temp addr = bld.v_mul_imm(bld.def(v1), provoking_vtx_index, 4u);
10845
10846 store_lds(ctx, 4, prim_id, 0x1u, addr, 0u, 4u);
10847 }
10848
10849 begin_divergent_if_else(ctx, &ic);
10850 end_divergent_if(ctx, &ic);
10851 }
10852
10853 void ngg_emit_nogs_output(isel_context *ctx)
10854 {
10855 /* Emits NGG GS output, for stages that don't have SW GS. */
10856
10857 if_context ic;
10858 Builder bld(ctx->program, ctx->block);
10859 bool late_prim_export = !ngg_early_prim_export(ctx);
10860
10861 /* NGG streamout is currently disabled by default. */
10862 assert(!ctx->args->shader_info->so.num_outputs);
10863
10864 if (late_prim_export) {
10865 /* VS exports are output to registers in a predecessor block. Emit phis to get them into this block. */
10866 create_export_phis(ctx);
10867 /* Do what we need to do in the GS threads. */
10868 ngg_emit_nogs_gsthreads(ctx);
10869
10870 /* What comes next should be executed on ES threads. */
10871 Temp is_es_thread = merged_wave_info_to_mask(ctx, 0);
10872 begin_divergent_if_then(ctx, &ic, is_es_thread);
10873 bld.reset(ctx->block);
10874 }
10875
10876 /* Export VS outputs */
10877 ctx->block->kind |= block_kind_export_end;
10878 create_vs_exports(ctx);
10879
10880 /* Export primitive ID */
10881 if (ctx->args->options->key.vs_common_out.export_prim_id) {
10882 Temp prim_id;
10883
10884 if (ctx->stage == ngg_vertex_gs) {
10885 /* Wait for GS threads to store primitive ID in LDS. */
10886 bld.barrier(aco_opcode::p_memory_barrier_shared);
10887 bld.sopp(aco_opcode::s_barrier);
10888
10889 /* Calculate LDS address where the GS threads stored the primitive ID. */
         Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
                                       get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16)));
         Temp thread_id_in_wave = emit_mbcnt(ctx, bld.def(v1));
         Temp wave_id_mul = bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_id_in_tg), ctx->program->wave_size);
         Temp thread_id_in_tg = bld.vadd32(bld.def(v1), Operand(wave_id_mul), Operand(thread_id_in_wave));
         Temp addr = bld.v_mul24_imm(bld.def(v1), thread_id_in_tg, 4u);

         /* Load primitive ID from LDS. */
         prim_id = load_lds(ctx, 4, bld.tmp(v1), addr, 0u, 4u);
      } else if (ctx->stage == ngg_tess_eval_gs) {
         /* TES: Just use the patch ID as the primitive ID. */
         prim_id = get_arg(ctx, ctx->args->ac.tes_patch_id);
      } else {
         unreachable("unsupported NGG shader stage.");
      }

      ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
      ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = prim_id;

      export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, nullptr);
   }

   if (late_prim_export) {
      begin_divergent_if_else(ctx, &ic);
      end_divergent_if(ctx, &ic);
      bld.reset(ctx->block);
   }
}

void select_program(Program *program,
                    unsigned shader_count,
                    struct nir_shader *const *shaders,
                    ac_shader_config* config,
                    struct radv_shader_args *args)
{
   isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
   if_context ic_merged_wave_info;
   bool ngg_no_gs = ctx.stage == ngg_vertex_gs || ctx.stage == ngg_tess_eval_gs;

   for (unsigned i = 0; i < shader_count; i++) {
      nir_shader *nir = shaders[i];
      init_context(&ctx, nir);

      setup_fp_mode(&ctx, nir);

      if (!i) {
         /* needs to be after init_context() for FS */
         Pseudo_instruction *startpgm = add_startpgm(&ctx);
         append_logical_start(ctx.block);

         if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
            fix_ls_vgpr_init_bug(&ctx, startpgm);

         split_arguments(&ctx, startpgm);
      }

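      /* NGG without a SW GS: request export space for the workgroup and, with early
       * primitive export, emit the primitive exports before the shader body.
       */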
      if (ngg_no_gs) {
         ngg_emit_sendmsg_gs_alloc_req(&ctx);

         if (ngg_early_prim_export(&ctx))
            ngg_emit_nogs_gsthreads(&ctx);
      }

      /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
      nir_function_impl *func = nir_shader_get_entrypoint(nir);
      bool empty_shader = nir_cf_list_is_empty_block(&func->body) &&
                          ((nir->info.stage == MESA_SHADER_VERTEX &&
                            (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
                           (nir->info.stage == MESA_SHADER_TESS_EVAL &&
                            ctx.stage == tess_eval_geometry_gs));

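      /* In merged shaders, guard each part with a divergent if so that only its own
       * threads execute it. With tcs_in_out_eq, the VS and TCS parts share one guard:
       * it is opened before the VS body (i == 0) and closed after the TCS body (i == 1).
       */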
      bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : ((shader_count >= 2 && !empty_shader) || ngg_no_gs);
      bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info;
      if (check_merged_wave_info) {
         Temp cond = merged_wave_info_to_mask(&ctx, i);
         begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
      }

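      /* The second part of a merged shader waits for the first part's LDS stores before reading them. */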
      if (i) {
         Builder bld(ctx.program, ctx.block);

         bld.barrier(aco_opcode::p_memory_barrier_shared);
         bld.sopp(aco_opcode::s_barrier);

         if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
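            /* The GS wave ID is in bits [16:23] of merged_wave_info and is defined
             * straight into m0, where the GS message instructions expect it. */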
            ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
         }
      } else if (ctx.stage == geometry_gs)
         ctx.gs_wave_id = get_arg(&ctx, args->gs_wave_id);

      if (ctx.stage == fragment_fs)
         handle_bc_optimize(&ctx);

      visit_cf_list(&ctx, &func->body);

      if (ctx.program->info->so.num_outputs && (ctx.stage & hw_vs))
         emit_streamout(&ctx, 0);

      if (ctx.stage & hw_vs) {
         create_vs_exports(&ctx);
         ctx.block->kind |= block_kind_export_end;
      } else if (ngg_no_gs && ngg_early_prim_export(&ctx)) {
         ngg_emit_nogs_output(&ctx);
      } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
         Builder bld(ctx.program, ctx.block);
         bld.barrier(aco_opcode::p_memory_barrier_gs_data);
         bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
      } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
         write_tcs_tess_factors(&ctx);
      }

      if (ctx.stage == fragment_fs) {
         create_fs_exports(&ctx);
         ctx.block->kind |= block_kind_export_end;
      }

      if (endif_merged_wave_info) {
         begin_divergent_if_else(&ctx, &ic_merged_wave_info);
         end_divergent_if(&ctx, &ic_merged_wave_info);
      }

      if (ngg_no_gs && !ngg_early_prim_export(&ctx))
         ngg_emit_nogs_output(&ctx);

      if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
         /* Outputs of the previous stage are inputs to the next stage */
         ctx.inputs = ctx.outputs;
         ctx.outputs = shader_io_state();
      }
   }

   program->config->float_mode = program->blocks[0].fp_mode.val;

   append_logical_end(ctx.block);
   ctx.block->kind |= block_kind_uniform;
   Builder bld(ctx.program, ctx.block);
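   /* Write back the scalar (SMEM) L1 cache before ending the program, if required. */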
   if (ctx.program->wb_smem_l1_on_end)
      bld.smem(aco_opcode::s_dcache_wb, false);
   bld.sopp(aco_opcode::s_endpgm);

   cleanup_cfg(program);
}

void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
                           ac_shader_config* config,
                           struct radv_shader_args *args)
{
   isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);

   ctx.block->fp_mode = program->next_fp_mode;

   add_startpgm(&ctx);
   append_logical_start(ctx.block);

   Builder bld(ctx.program, ctx.block);

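   /* Load the 4-dword GSVS ring buffer descriptor from the private segment buffer. */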
   Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), program->private_segment_buffer, Operand(RING_GSVS_VS * 16u));

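   /* Without streamout, only stream 0 is copied; otherwise the stream to process
    * is read from bits [24:25] of the streamout config. */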
   Operand stream_id(0u);
   if (args->shader_info->so.num_outputs)
      stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
                           get_arg(&ctx, ctx.args->streamout_config), Operand(0x20018u));

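   /* One copy-shader invocation per GS output vertex: vertex_id * 4 is used as the
    * per-vertex byte offset into the GSVS ring. */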
   Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), get_arg(&ctx, ctx.args->ac.vertex_id));

   std::stack<Block> endif_blocks;

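   /* Copy the outputs of each GS stream. Only stream 0 is exported as varyings;
    * the other streams are only relevant for streamout. */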
   for (unsigned stream = 0; stream < 4; stream++) {
      if (stream_id.isConstant() && stream != stream_id.constantValue())
         continue;

      unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
      if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
         continue;

      memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));

      unsigned BB_if_idx = ctx.block->index;
      Block BB_endif = Block();
      if (!stream_id.isConstant()) {
         /* begin IF */
         Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream));
         append_logical_end(ctx.block);
         ctx.block->kind |= block_kind_uniform;
         bld.branch(aco_opcode::p_cbranch_z, cond);

         BB_endif.kind |= ctx.block->kind & block_kind_top_level;

         ctx.block = ctx.program->create_and_insert_block();
         add_edge(BB_if_idx, ctx.block);
         bld.reset(ctx.block);
         append_logical_start(ctx.block);
      }

      unsigned offset = 0;
      for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
         if (args->shader_info->gs.output_streams[i] != stream)
            continue;

         unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
         unsigned length = util_last_bit(output_usage_mask);
         for (unsigned j = 0; j < length; ++j) {
            if (!(output_usage_mask & (1 << j)))
               continue;

            unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
            Temp voffset = vtx_offset;
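            /* The MUBUF immediate offset is limited to 12 bits; fold the 4 KiB-aligned
             * excess into the vertex offset. */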
            if (const_offset >= 4096u) {
               voffset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), voffset);
               const_offset %= 4096u;
            }

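            /* Load one dword of this output component for the current vertex from the GSVS ring. */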
            aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)};
            mubuf->definitions[0] = bld.def(v1);
            mubuf->operands[0] = Operand(gsvs_ring);
            mubuf->operands[1] = Operand(voffset);
            mubuf->operands[2] = Operand(0u);
            mubuf->offen = true;
            mubuf->offset = const_offset;
            mubuf->glc = true;
            mubuf->slc = true;
            mubuf->dlc = args->options->chip_class >= GFX10;
            mubuf->barrier = barrier_none;
            mubuf->can_reorder = true;

            ctx.outputs.mask[i] |= 1 << j;
            ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();

            bld.insert(std::move(mubuf));

            offset++;
         }
      }

      if (args->shader_info->so.num_outputs) {
         emit_streamout(&ctx, stream);
         bld.reset(ctx.block);
      }

      if (stream == 0) {
         create_vs_exports(&ctx);
         ctx.block->kind |= block_kind_export_end;
      }

      if (!stream_id.isConstant()) {
         append_logical_end(ctx.block);

         /* branch from then block to endif block */
         bld.branch(aco_opcode::p_branch);
         add_edge(ctx.block->index, &BB_endif);
         ctx.block->kind |= block_kind_uniform;

         /* emit else block */
         ctx.block = ctx.program->create_and_insert_block();
         add_edge(BB_if_idx, ctx.block);
         bld.reset(ctx.block);
         append_logical_start(ctx.block);

         endif_blocks.push(std::move(BB_endif));
      }
   }

   while (!endif_blocks.empty()) {
      Block BB_endif = std::move(endif_blocks.top());
      endif_blocks.pop();

      Block *BB_else = ctx.block;

      append_logical_end(BB_else);
      /* branch from else block to endif block */
      bld.branch(aco_opcode::p_branch);
      add_edge(BB_else->index, &BB_endif);
      BB_else->kind |= block_kind_uniform;

      /* emit endif merge block */
      ctx.block = program->insert_block(std::move(BB_endif));
      bld.reset(ctx.block);
      append_logical_start(ctx.block);
   }

   program->config->float_mode = program->blocks[0].fp_mode.val;

   append_logical_end(ctx.block);
   ctx.block->kind |= block_kind_uniform;
   bld.sopp(aco_opcode::s_endpgm);

   cleanup_cfg(program);
}
}