src/amd/compiler/aco_instruction_selection.cpp

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  * Copyright © 2018 Google
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  *
  24  */
  25
  26 #include <algorithm>
  27 #include <array>
  28 #include <stack>
  29 #include <map>
  30
  31 #include "ac_shader_util.h"
  32 #include "aco_ir.h"
  33 #include "aco_builder.h"
  34 #include "aco_interface.h"
  35 #include "aco_instruction_selection_setup.cpp"
  36 #include "util/fast_idiv_by_const.h"
  37
  38 namespace aco {
  39 namespace {
  40
  41 class loop_info_RAII {
  42    isel_context* ctx;
  43    unsigned header_idx_old;
  44    Block* exit_old;
  45    bool divergent_cont_old;
  46    bool divergent_branch_old;
  47    bool divergent_if_old;
  48
  49 public:
  50    loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
  51       : ctx(ctx),
  52         header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
  53         divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
  54         divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
  55         divergent_if_old(ctx->cf_info.parent_if.is_divergent)
  56    {
  57       ctx->cf_info.parent_loop.header_idx = loop_header_idx;
  58       ctx->cf_info.parent_loop.exit = loop_exit;
  59       ctx->cf_info.parent_loop.has_divergent_continue = false;
  60       ctx->cf_info.parent_loop.has_divergent_branch = false;
  61       ctx->cf_info.parent_if.is_divergent = false;
  62       ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
  63    }
  64
  65    ~loop_info_RAII()
  66    {
  67       ctx->cf_info.parent_loop.header_idx = header_idx_old;
  68       ctx->cf_info.parent_loop.exit = exit_old;
  69       ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
  70       ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
  71       ctx->cf_info.parent_if.is_divergent = divergent_if_old;
  72       ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
  73       if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
  74          ctx->cf_info.exec_potentially_empty_discard = false;
  75    }
  76 };
  77
/* State bundle for an if construct.
 * NOTE(review): the code that fills and reads this struct is outside this part
 * of the file; the field notes below are inferred from the names — confirm. */
struct if_context {
   Temp cond; /* presumably the branch condition */

   /* presumably copies of control-flow state taken on entry, to be restored later */
   bool divergent_old;
   bool exec_potentially_empty_discard_old;
   bool exec_potentially_empty_break_old;
   uint16_t exec_potentially_empty_break_depth_old;

   /* presumably block indices and blocks that make up the emitted if structure */
   unsigned BB_if_idx;
   unsigned invert_idx;
   bool uniform_has_then_branch;
   bool then_branch_divergent;
   Block BB_invert;
   Block BB_endif;
};
  93
/* Forward declaration; the body is not visible at this point in the file. */
static bool visit_cf_list(struct isel_context *ctx,
                          struct exec_list *list);
  96
  97 static void add_logical_edge(unsigned pred_idx, Block *succ)
  98 {
  99    succ->logical_preds.emplace_back(pred_idx);
 100 }
 101
 102
 103 static void add_linear_edge(unsigned pred_idx, Block *succ)
 104 {
 105    succ->linear_preds.emplace_back(pred_idx);
 106 }
 107
 108 static void add_edge(unsigned pred_idx, Block *succ)
 109 {
 110    add_logical_edge(pred_idx, succ);
 111    add_linear_edge(pred_idx, succ);
 112 }
 113
 114 static void append_logical_start(Block *b)
 115 {
 116    Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
 117 }
 118
 119 static void append_logical_end(Block *b)
 120 {
 121    Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
 122 }
 123
 124 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
 125 {
 126    assert(ctx->allocated[def->index].id());
 127    return ctx->allocated[def->index];
 128 }
 129
 130 Temp emit_mbcnt(isel_context *ctx, Definition dst,
 131                 Operand mask_lo = Operand((uint32_t) -1), Operand mask_hi = Operand((uint32_t) -1))
 132 {
 133    Builder bld(ctx->program, ctx->block);
 134    Definition lo_def = ctx->program->wave_size == 32 ? dst : bld.def(v1);
 135    Temp thread_id_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, lo_def, mask_lo, Operand(0u));
 136
 137    if (ctx->program->wave_size == 32) {
 138       return thread_id_lo;
 139    } else {
 140       Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
 141       return thread_id_hi;
 142    }
 143 }
 144
 145 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
 146 {
 147    Builder bld(ctx->program, ctx->block);
 148
 149    if (!dst.id())
 150       dst = bld.tmp(src.regClass());
 151
 152    assert(src.size() == dst.size());
 153
 154    if (ctx->stage != fragment_fs) {
 155       if (!dst.id())
 156          return src;
 157
 158       bld.copy(Definition(dst), src);
 159       return dst;
 160    }
 161
 162    bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
 163    ctx->program->needs_wqm |= program_needs_wqm;
 164    return dst;
 165 }
 166
/* Emits code that fetches `data` as selected by `index` and returns the result.
 *
 * - An s1 index is handled with a single readlane into an SGPR.
 * - Otherwise the index is shifted left by 2 and, on chips up to GFX9 or when
 *   the wave size is 32, passed to ds_bpermute_b32.
 * - In the remaining case (later chips, wave64) the p_wave64_bpermute pseudo
 *   instruction is emitted instead.
 *
 * A non-s1 index requires at least GFX8 (asserted). */
static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
{
   if (index.regClass() == s1)
      return bld.readlane(bld.def(s1), data, index);

   Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);

   /* Currently not implemented on GFX6-7 */
   assert(ctx->options->chip_class >= GFX8);

   if (ctx->options->chip_class <= GFX9 || ctx->program->wave_size == 32) {
      return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
   }

   /* GFX10, wave64 mode:
    * The bpermute instruction is limited to half-wave operation, which means that it can't
    * properly support subgroup shuffle like older generations (or wave32 mode), so we
    * emulate it here.
    */
   /* the program-wide register adjustments are only done the first time this path is taken */
   if (!ctx->has_gfx10_wave64_bpermute) {
      ctx->has_gfx10_wave64_bpermute = true;
      ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
      ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
   }

   /* compare bit 5 (0x20) of the current lane id and of the index */
   Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
   Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id);
   Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index);
   Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), lane_is_hi, index_is_hi);

   return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
                        bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute);
}
 200
 201 Temp as_vgpr(isel_context *ctx, Temp val)
 202 {
 203    if (val.type() == RegType::sgpr) {
 204       Builder bld(ctx->program, ctx->block);
 205       return bld.copy(bld.def(RegType::vgpr, val.size()), val);
 206    }
 207    assert(val.type() == RegType::vgpr);
 208    return val;
 209 }
 210
 211 //assumes a != 0xffffffff
 212 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
 213 {
 214    assert(b != 0);
 215    Builder bld(ctx->program, ctx->block);
 216
 217    if (util_is_power_of_two_or_zero(b)) {
 218       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
 219       return;
 220    }
 221
 222    util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
 223
 224    assert(info.multiplier <= 0xffffffff);
 225
 226    bool pre_shift = info.pre_shift != 0;
 227    bool increment = info.increment != 0;
 228    bool multiply = true;
 229    bool post_shift = info.post_shift != 0;
 230
 231    if (!pre_shift && !increment && !multiply && !post_shift) {
 232       bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
 233       return;
 234    }
 235
 236    Temp pre_shift_dst = a;
 237    if (pre_shift) {
 238       pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
 239       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
 240    }
 241
 242    Temp increment_dst = pre_shift_dst;
 243    if (increment) {
 244       increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
 245       bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
 246    }
 247
 248    Temp multiply_dst = increment_dst;
 249    if (multiply) {
 250       multiply_dst = post_shift ? bld.tmp(v1) : dst;
 251       bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
 252                bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
 253    }
 254
 255    if (post_shift) {
 256       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
 257    }
 258 }
 259
 260 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
 261 {
 262    Builder bld(ctx->program, ctx->block);
 263    bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
 264 }
 265
 266
 267 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
 268 {
 269    /* no need to extract the whole vector */
 270    if (src.regClass() == dst_rc) {
 271       assert(idx == 0);
 272       return src;
 273    }
 274
 275    assert(src.bytes() > (idx * dst_rc.bytes()));
 276    Builder bld(ctx->program, ctx->block);
 277    auto it = ctx->allocated_vec.find(src.id());
 278    if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
 279       if (it->second[idx].regClass() == dst_rc) {
 280          return it->second[idx];
 281       } else {
 282          assert(!dst_rc.is_subdword());
 283          assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
 284          return bld.copy(bld.def(dst_rc), it->second[idx]);
 285       }
 286    }
 287
 288    if (dst_rc.is_subdword())
 289       src = as_vgpr(ctx, src);
 290
 291    if (src.bytes() == dst_rc.bytes()) {
 292       assert(idx == 0);
 293       return bld.copy(bld.def(dst_rc), src);
 294    } else {
 295       Temp dst = bld.tmp(dst_rc);
 296       emit_extract_vector(ctx, src, idx, dst);
 297       return dst;
 298    }
 299 }
 300
 301 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
 302 {
 303    if (num_components == 1)
 304       return;
 305    if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
 306       return;
 307    aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
 308    split->operands[0] = Operand(vec_src);
 309    std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
 310    RegClass rc;
 311    if (num_components > vec_src.size()) {
 312       if (vec_src.type() == RegType::sgpr)
 313          return;
 314
 315       /* sub-dword split */
 316       assert(vec_src.type() == RegType::vgpr);
 317       rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
 318    } else {
 319       rc = RegClass(vec_src.type(), vec_src.size() / num_components);
 320    }
 321    for (unsigned i = 0; i < num_components; i++) {
 322       elems[i] = {ctx->program->allocateId(), rc};
 323       split->definitions[i] = Definition(elems[i]);
 324    }
 325    ctx->block->instructions.emplace_back(std::move(split));
 326    ctx->allocated_vec.emplace(vec_src.id(), elems);
 327 }
 328
 329 /* This vector expansion uses a mask to determine which elements in the new vector
 330  * come from the original vector. The other elements are undefined. */
 331 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
 332 {
 333    emit_split_vector(ctx, vec_src, util_bitcount(mask));
 334
 335    if (vec_src == dst)
 336       return;
 337
 338    Builder bld(ctx->program, ctx->block);
 339    if (num_components == 1) {
 340       if (dst.type() == RegType::sgpr)
 341          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
 342       else
 343          bld.copy(Definition(dst), vec_src);
 344       return;
 345    }
 346
 347    unsigned component_size = dst.size() / num_components;
 348    std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
 349
 350    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
 351    vec->definitions[0] = Definition(dst);
 352    unsigned k = 0;
 353    for (unsigned i = 0; i < num_components; i++) {
 354       if (mask & (1 << i)) {
 355          Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
 356          if (dst.type() == RegType::sgpr)
 357             src = bld.as_uniform(src);
 358          vec->operands[i] = Operand(src);
 359       } else {
 360          vec->operands[i] = Operand(0u);
 361       }
 362       elems[i] = vec->operands[i].getTemp();
 363    }
 364    ctx->block->instructions.emplace_back(std::move(vec));
 365    ctx->allocated_vec.emplace(dst.id(), elems);
 366 }
 367
/* adjust misaligned small bit size loads
 *
 * Shifts the SGPR value vec right by a whole number of bytes and writes the
 * result to dst. The byte count is `offset` itself when it is a constant
 * (which must be 1..3) and (offset & 0x3) otherwise.
 * Only vec sizes of 1, 2 and 4 dwords emit code; other sizes emit nothing. */
void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
{
   Builder bld(ctx->program, ctx->block);
   Operand shift;
   Temp select = Temp();
   if (offset.isConstant()) {
      assert(offset.constantValue() && offset.constantValue() < 4);
      shift = Operand(offset.constantValue() * 8);
   } else {
      /* bit_offset = 8 * (offset & 0x3) */
      Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u));
      /* select receives the SCC output of the shift that computes the bit offset */
      select = bld.tmp(s1);
      shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u));
   }

   if (vec.size() == 1) {
      bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
   } else if (vec.size() == 2) {
      /* shift straight into dst if it is 64 bits wide, otherwise shift into a
       * temporary and extract element 0 into dst */
      Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
      bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
      if (tmp == dst)
         emit_split_vector(ctx, dst, 2);
      else
         emit_extract_vector(ctx, tmp, 0, dst);
   } else if (vec.size() == 4) {
      /* split into two 64-bit halves; only the first dword of the upper half is used */
      Temp lo = bld.tmp(s2), hi = bld.tmp(s2);
      bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
      hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
      /* only for a non-constant offset: s_cselect between hi and 0 based on select */
      if (select != Temp())
         hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), select);
      lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
      Temp mid = bld.tmp(s1);
      lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
      /* merge the shifted upper dword into the second dword of the result */
      hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
      mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
      emit_split_vector(ctx, dst, 2);
   }
}
 408
 409 /* this function trims subdword vectors:
 410  * if dst is vgpr - split the src and create a shrunk version according to the mask.
 411  * if dst is sgpr - split the src, but move the original to sgpr. */
 412 void trim_subdword_vector(isel_context *ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
 413 {
 414    assert(vec_src.type() == RegType::vgpr);
 415    emit_split_vector(ctx, vec_src, num_components);
 416
 417    Builder bld(ctx->program, ctx->block);
 418    std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
 419    unsigned component_size = vec_src.bytes() / num_components;
 420    RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
 421
 422    unsigned k = 0;
 423    for (unsigned i = 0; i < num_components; i++) {
 424       if (mask & (1 << i))
 425          elems[k++] = emit_extract_vector(ctx, vec_src, i, rc);
 426    }
 427
 428    if (dst.type() == RegType::vgpr) {
 429       assert(dst.bytes() == k * component_size);
 430       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, k, 1)};
 431       for (unsigned i = 0; i < k; i++)
 432          vec->operands[i] = Operand(elems[i]);
 433       vec->definitions[0] = Definition(dst);
 434       bld.insert(std::move(vec));
 435    } else {
 436       // TODO: alignbyte if mask doesn't start with 1?
 437       assert(mask & 1);
 438       assert(dst.size() == vec_src.size());
 439       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
 440    }
 441    ctx->allocated_vec.emplace(dst.id(), elems);
 442 }
 443
 444 Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
 445 {
 446    Builder bld(ctx->program, ctx->block);
 447    if (!dst.id())
 448       dst = bld.tmp(bld.lm);
 449
 450    assert(val.regClass() == s1);
 451    assert(dst.regClass() == bld.lm);
 452
 453    return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
 454 }
 455
 456 Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
 457 {
 458    Builder bld(ctx->program, ctx->block);
 459    if (!dst.id())
 460       dst = bld.tmp(s1);
 461
 462    assert(val.regClass() == bld.lm);
 463    assert(dst.regClass() == s1);
 464
 465    /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
 466    Temp tmp = bld.tmp(s1);
 467    bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
 468    return emit_wqm(ctx, tmp, dst);
 469 }
 470
 471 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
 472 {
 473    if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
 474       return get_ssa_temp(ctx, src.src.ssa);
 475
 476    if (src.src.ssa->num_components == size) {
 477       bool identity_swizzle = true;
 478       for (unsigned i = 0; identity_swizzle && i < size; i++) {
 479          if (src.swizzle[i] != i)
 480             identity_swizzle = false;
 481       }
 482       if (identity_swizzle)
 483          return get_ssa_temp(ctx, src.src.ssa);
 484    }
 485
 486    Temp vec = get_ssa_temp(ctx, src.src.ssa);
 487    unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
 488    assert(elem_size > 0);
 489    assert(vec.bytes() % elem_size == 0);
 490
 491    if (elem_size < 4 && vec.type() == RegType::sgpr) {
 492       assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
 493       assert(size == 1);
 494       unsigned swizzle = src.swizzle[0];
 495       if (vec.size() > 1) {
 496          assert(src.src.ssa->bit_size == 16);
 497          vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
 498          swizzle = swizzle & 1;
 499       }
 500       if (swizzle == 0)
 501          return vec;
 502
 503       Temp dst{ctx->program->allocateId(), s1};
 504       aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 1)};
 505       bfe->operands[0] = Operand(vec);
 506       bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle)));
 507       bfe->definitions[0] = Definition(dst);
 508       ctx->block->instructions.emplace_back(std::move(bfe));
 509       return dst;
 510    }
 511
 512    RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4);
 513    if (size == 1) {
 514       return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
 515    } else {
 516       assert(size <= 4);
 517       std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
 518       aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
 519       for (unsigned i = 0; i < size; ++i) {
 520          elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
 521          vec_instr->operands[i] = Operand{elems[i]};
 522       }
 523       Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size / 4)};
 524       vec_instr->definitions[0] = Definition(dst);
 525       ctx->block->instructions.emplace_back(std::move(vec_instr));
 526       ctx->allocated_vec.emplace(dst.id(), elems);
 527       return dst;
 528    }
 529 }
 530
 531 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
 532 {
 533    if (ptr.size() == 2)
 534       return ptr;
 535    Builder bld(ctx->program, ctx->block);
 536    if (ptr.type() == RegType::vgpr)
 537       ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
 538    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
 539                      ptr, Operand((unsigned)ctx->options->address32_hi));
 540 }
 541
 542 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
 543 {
 544    aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
 545    sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
 546    sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
 547    sop2->definitions[0] = Definition(dst);
 548    if (writes_scc)
 549       sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
 550    ctx->block->instructions.emplace_back(std::move(sop2));
 551 }
 552
 553 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
 554                            bool commutative, bool swap_srcs=false, bool flush_denorms = false)
 555 {
 556    Builder bld(ctx->program, ctx->block);
 557    Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
 558    Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
 559    if (src1.type() == RegType::sgpr) {
 560       if (commutative && src0.type() == RegType::vgpr) {
 561          Temp t = src0;
 562          src0 = src1;
 563          src1 = t;
 564       } else if (src0.type() == RegType::vgpr &&
 565                  op != aco_opcode::v_madmk_f32 &&
 566                  op != aco_opcode::v_madak_f32 &&
 567                  op != aco_opcode::v_madmk_f16 &&
 568                  op != aco_opcode::v_madak_f16) {
 569          /* If the instruction is not commutative, we emit a VOP3A instruction */
 570          bld.vop2_e64(op, Definition(dst), src0, src1);
 571          return;
 572       } else {
 573          src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
 574       }
 575    }
 576
 577    if (flush_denorms && ctx->program->chip_class < GFX9) {
 578       assert(dst.size() == 1);
 579       Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
 580       bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
 581    } else {
 582       bld.vop2(op, Definition(dst), src0, src1);
 583    }
 584 }
 585
 586 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
 587                             bool flush_denorms = false)
 588 {
 589    Temp src0 = get_alu_src(ctx, instr->src[0]);
 590    Temp src1 = get_alu_src(ctx, instr->src[1]);
 591    Temp src2 = get_alu_src(ctx, instr->src[2]);
 592
 593    /* ensure that the instruction has at most 1 sgpr operand
 594     * The optimizer will inline constants for us */
 595    if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
 596       src0 = as_vgpr(ctx, src0);
 597    if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
 598       src1 = as_vgpr(ctx, src1);
 599    if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
 600       src2 = as_vgpr(ctx, src2);
 601
 602    Builder bld(ctx->program, ctx->block);
 603    if (flush_denorms && ctx->program->chip_class < GFX9) {
 604       assert(dst.size() == 1);
 605       Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2);
 606       bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
 607    } else {
 608       bld.vop3(op, Definition(dst), src0, src1, src2);
 609    }
 610 }
 611
 612 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 613 {
 614    Builder bld(ctx->program, ctx->block);
 615    bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
 616 }
 617
 618 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 619 {
 620    Temp src0 = get_alu_src(ctx, instr->src[0]);
 621    Temp src1 = get_alu_src(ctx, instr->src[1]);
 622    assert(src0.size() == src1.size());
 623
 624    aco_ptr<Instruction> vopc;
 625    if (src1.type() == RegType::sgpr) {
 626       if (src0.type() == RegType::vgpr) {
 627          /* to swap the operands, we might also have to change the opcode */
 628          switch (op) {
 629             case aco_opcode::v_cmp_lt_f32:
 630                op = aco_opcode::v_cmp_gt_f32;
 631                break;
 632             case aco_opcode::v_cmp_ge_f32:
 633                op = aco_opcode::v_cmp_le_f32;
 634                break;
 635             case aco_opcode::v_cmp_lt_i32:
 636                op = aco_opcode::v_cmp_gt_i32;
 637                break;
 638             case aco_opcode::v_cmp_ge_i32:
 639                op = aco_opcode::v_cmp_le_i32;
 640                break;
 641             case aco_opcode::v_cmp_lt_u32:
 642                op = aco_opcode::v_cmp_gt_u32;
 643                break;
 644             case aco_opcode::v_cmp_ge_u32:
 645                op = aco_opcode::v_cmp_le_u32;
 646                break;
 647             case aco_opcode::v_cmp_lt_f64:
 648                op = aco_opcode::v_cmp_gt_f64;
 649                break;
 650             case aco_opcode::v_cmp_ge_f64:
 651                op = aco_opcode::v_cmp_le_f64;
 652                break;
 653             case aco_opcode::v_cmp_lt_i64:
 654                op = aco_opcode::v_cmp_gt_i64;
 655                break;
 656             case aco_opcode::v_cmp_ge_i64:
 657                op = aco_opcode::v_cmp_le_i64;
 658                break;
 659             case aco_opcode::v_cmp_lt_u64:
 660                op = aco_opcode::v_cmp_gt_u64;
 661                break;
 662             case aco_opcode::v_cmp_ge_u64:
 663                op = aco_opcode::v_cmp_le_u64;
 664                break;
 665             default: /* eq and ne are commutative */
 666                break;
 667          }
 668          Temp t = src0;
 669          src0 = src1;
 670          src1 = t;
 671       } else {
 672          src1 = as_vgpr(ctx, src1);
 673       }
 674    }
 675
 676    Builder bld(ctx->program, ctx->block);
 677    bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
 678 }
 679
 680 void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 681 {
 682    Temp src0 = get_alu_src(ctx, instr->src[0]);
 683    Temp src1 = get_alu_src(ctx, instr->src[1]);
 684    Builder bld(ctx->program, ctx->block);
 685
 686    assert(dst.regClass() == bld.lm);
 687    assert(src0.type() == RegType::sgpr);
 688    assert(src1.type() == RegType::sgpr);
 689    assert(src0.regClass() == src1.regClass());
 690
 691    /* Emit the SALU comparison instruction */
 692    Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
 693    /* Turn the result into a per-lane bool */
 694    bool_to_vector_condition(ctx, cmp, dst);
 695 }
 696
 697 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
 698                      aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
 699 {
 700    aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : s32_op;
 701    aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : v32_op;
 702    bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
 703    bool use_valu = s_op == aco_opcode::num_opcodes ||
 704                    divergent_vals ||
 705                    ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
 706                    ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
 707    aco_opcode op = use_valu ? v_op : s_op;
 708    assert(op != aco_opcode::num_opcodes);
 709    assert(dst.regClass() == ctx->program->lane_mask);
 710
 711    if (use_valu)
 712       emit_vopc_instruction(ctx, instr, op, dst);
 713    else
 714       emit_sopc_instruction(ctx, instr, op, dst);
 715 }
 716
 717 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
 718 {
 719    Builder bld(ctx->program, ctx->block);
 720    Temp src0 = get_alu_src(ctx, instr->src[0]);
 721    Temp src1 = get_alu_src(ctx, instr->src[1]);
 722
 723    assert(dst.regClass() == bld.lm);
 724    assert(src0.regClass() == bld.lm);
 725    assert(src1.regClass() == bld.lm);
 726
 727    bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
 728 }
 729
 730 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
 731 {
 732    Builder bld(ctx->program, ctx->block);
 733    Temp cond = get_alu_src(ctx, instr->src[0]);
 734    Temp then = get_alu_src(ctx, instr->src[1]);
 735    Temp els = get_alu_src(ctx, instr->src[2]);
 736
 737    assert(cond.regClass() == bld.lm);
 738
 739    if (dst.type() == RegType::vgpr) {
 740       aco_ptr<Instruction> bcsel;
 741       if (dst.size() == 1) {
 742          then = as_vgpr(ctx, then);
 743          els = as_vgpr(ctx, els);
 744
 745          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
 746       } else if (dst.size() == 2) {
 747          Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
 748          bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
 749          Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
 750          bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
 751
 752          Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
 753          Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
 754
 755          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
 756       } else {
 757          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 758          nir_print_instr(&instr->instr, stderr);
 759          fprintf(stderr, "\n");
 760       }
 761       return;
 762    }
 763
 764    if (instr->dest.dest.ssa.bit_size == 1) {
 765       assert(dst.regClass() == bld.lm);
 766       assert(then.regClass() == bld.lm);
 767       assert(els.regClass() == bld.lm);
 768    }
 769
 770    if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */
 771       if (dst.regClass() == s1 || dst.regClass() == s2) {
 772          assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
 773          assert(dst.size() == then.size());
 774          aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
 775          bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
 776       } else {
 777          fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
 778          nir_print_instr(&instr->instr, stderr);
 779          fprintf(stderr, "\n");
 780       }
 781       return;
 782    }
 783
 784    /* divergent boolean bcsel
 785     * this implements bcsel on bools: dst = s0 ? s1 : s2
 786     * are going to be: dst = (s0 & s1) | (~s0 & s2) */
 787    assert(instr->dest.dest.ssa.bit_size == 1);
 788
 789    if (cond.id() != then.id())
 790       then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
 791
 792    if (cond.id() == els.id())
 793       bld.sop1(Builder::s_mov, Definition(dst), then);
 794    else
 795       bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
 796                bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
 797 }
 798
 799 void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
 800                     aco_opcode op, uint32_t undo)
 801 {
 802    /* multiply by 16777216 to handle denormals */
 803    Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
 804                                as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
 805    Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
 806    scaled = bld.vop1(op, bld.def(v1), scaled);
 807    scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);
 808
 809    Temp not_scaled = bld.vop1(op, bld.def(v1), val);
 810
 811    bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
 812 }
 813
 814 void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
 815 {
 816    if (ctx->block->fp_mode.denorm32 == 0) {
 817       bld.vop1(aco_opcode::v_rcp_f32, dst, val);
 818       return;
 819    }
 820
 821    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
 822 }
 823
 824 void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
 825 {
 826    if (ctx->block->fp_mode.denorm32 == 0) {
 827       bld.vop1(aco_opcode::v_rsq_f32, dst, val);
 828       return;
 829    }
 830
 831    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
 832 }
 833
 834 void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
 835 {
 836    if (ctx->block->fp_mode.denorm32 == 0) {
 837       bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
 838       return;
 839    }
 840
 841    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
 842 }
 843
 844 void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
 845 {
 846    if (ctx->block->fp_mode.denorm32 == 0) {
 847       bld.vop1(aco_opcode::v_log_f32, dst, val);
 848       return;
 849    }
 850
 851    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
 852 }
 853
 854 Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
 855 {
 856    if (ctx->options->chip_class >= GFX7)
 857       return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
 858
 859    /* GFX6 doesn't support V_TRUNC_F64, lower it. */
 860    /* TODO: create more efficient code! */
 861    if (val.type() == RegType::sgpr)
 862       val = as_vgpr(ctx, val);
 863
 864    /* Split the input value. */
 865    Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
 866    bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
 867
 868    /* Extract the exponent and compute the unbiased value. */
 869    Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f64, bld.def(v1), val);
 870
 871    /* Extract the fractional part. */
 872    Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
 873    fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
 874
 875    Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
 876    bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask);
 877
 878    Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
 879    Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
 880    fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
 881    tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
 882    fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
 883
 884    /* Get the sign bit. */
 885    Temp sign = bld.vop2(aco_opcode::v_ashr_i32, bld.def(v1), Operand(31u), val_hi);
 886
 887    /* Decide the operation to apply depending on the unbiased exponent. */
 888    Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
 889    Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0);
 890    Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
 891    Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u));
 892    dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
 893    dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
 894
 895    return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
 896 }
 897
 898 Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
 899 {
 900    if (ctx->options->chip_class >= GFX7)
 901       return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
 902
 903    /* GFX6 doesn't support V_FLOOR_F64, lower it. */
 904    Temp src0 = as_vgpr(ctx, val);
 905
 906    Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
 907    Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu));
 908
 909    Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
 910    Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
 911    Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
 912
 913    Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
 914    bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
 915    Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
 916    bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
 917
 918    Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
 919    Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
 920
 921    Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
 922
 923    Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
 924    static_cast<VOP3A_instruction*>(add)->neg[1] = true;
 925
 926    return add->definitions[0].getTemp();
 927 }
 928
 929 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
 930 {
 931    if (!instr->dest.dest.is_ssa) {
 932       fprintf(stderr, "nir alu dst not in ssa: ");
 933       nir_print_instr(&instr->instr, stderr);
 934       fprintf(stderr, "\n");
 935       abort();
 936    }
 937    Builder bld(ctx->program, ctx->block);
 938    Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
 939    switch(instr->op) {
 940    case nir_op_vec2:
 941    case nir_op_vec3:
 942    case nir_op_vec4: {
 943       std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
 944       unsigned num = instr->dest.dest.ssa.num_components;
 945       for (unsigned i = 0; i < num; ++i)
 946          elems[i] = get_alu_src(ctx, instr->src[i]);
 947
 948       if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
 949          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
 950          for (unsigned i = 0; i < num; ++i)
 951             vec->operands[i] = Operand{elems[i]};
 952          vec->definitions[0] = Definition(dst);
 953          ctx->block->instructions.emplace_back(std::move(vec));
 954          ctx->allocated_vec.emplace(dst.id(), elems);
 955       } else {
 956          // TODO: that is a bit suboptimal..
 957          Temp mask = bld.copy(bld.def(s1), Operand((1u << instr->dest.dest.ssa.bit_size) - 1));
 958          for (unsigned i = 0; i < num - 1; ++i)
 959             if (((i+1) * instr->dest.dest.ssa.bit_size) % 32)
 960                elems[i] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
 961          for (unsigned i = 0; i < num; ++i) {
 962             unsigned bit = i * instr->dest.dest.ssa.bit_size;
 963             if (bit % 32 == 0) {
 964                elems[bit / 32] = elems[i];
 965             } else {
 966                elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
 967                                    elems[i], Operand((i * instr->dest.dest.ssa.bit_size) % 32));
 968                elems[bit / 32] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[bit / 32], elems[i]);
 969             }
 970          }
 971          if (dst.size() == 1)
 972             bld.copy(Definition(dst), elems[0]);
 973          else
 974             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), elems[0], elems[1]);
 975       }
 976       break;
 977    }
 978    case nir_op_mov: {
 979       Temp src = get_alu_src(ctx, instr->src[0]);
 980       aco_ptr<Instruction> mov;
 981       if (dst.type() == RegType::sgpr) {
 982          if (src.type() == RegType::vgpr)
 983             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
 984          else if (src.regClass() == s1)
 985             bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
 986          else if (src.regClass() == s2)
 987             bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
 988          else
 989             unreachable("wrong src register class for nir_op_imov");
 990       } else if (dst.regClass() == v1) {
 991          bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
 992       } else if (dst.regClass() == v2) {
 993          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
 994       } else {
 995          nir_print_instr(&instr->instr, stderr);
 996          unreachable("Should have been lowered to scalar.");
 997       }
 998       break;
 999    }
1000    case nir_op_inot: {
1001       Temp src = get_alu_src(ctx, instr->src[0]);
1002       if (instr->dest.dest.ssa.bit_size == 1) {
1003          assert(src.regClass() == bld.lm);
1004          assert(dst.regClass() == bld.lm);
1005          /* Don't use s_andn2 here, this allows the optimizer to make a better decision */
1006          Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
1007          bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
1008       } else if (dst.regClass() == v1) {
1009          emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1010       } else if (dst.type() == RegType::sgpr) {
1011          aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1012          bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1013       } else {
1014          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1015          nir_print_instr(&instr->instr, stderr);
1016          fprintf(stderr, "\n");
1017       }
1018       break;
1019    }
1020    case nir_op_ineg: {
1021       Temp src = get_alu_src(ctx, instr->src[0]);
1022       if (dst.regClass() == v1) {
1023          bld.vsub32(Definition(dst), Operand(0u), Operand(src));
1024       } else if (dst.regClass() == s1) {
1025          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
1026       } else if (dst.size() == 2) {
1027          Temp src0 = bld.tmp(dst.type(), 1);
1028          Temp src1 = bld.tmp(dst.type(), 1);
1029          bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
1030
1031          if (dst.regClass() == s2) {
1032             Temp carry = bld.tmp(s1);
1033             Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
1034             Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
1035             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1036          } else {
1037             Temp lower = bld.tmp(v1);
1038             Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
1039             Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
1040             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1041          }
1042       } else {
1043          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1044          nir_print_instr(&instr->instr, stderr);
1045          fprintf(stderr, "\n");
1046       }
1047       break;
1048    }
1049    case nir_op_iabs: {
1050       if (dst.regClass() == s1) {
1051          bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
1052       } else if (dst.regClass() == v1) {
1053          Temp src = get_alu_src(ctx, instr->src[0]);
1054          bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
1055       } else {
1056          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1057          nir_print_instr(&instr->instr, stderr);
1058          fprintf(stderr, "\n");
1059       }
1060       break;
1061    }
1062    case nir_op_isign: {
1063       Temp src = get_alu_src(ctx, instr->src[0]);
1064       if (dst.regClass() == s1) {
1065          Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
1066          Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u));
1067          bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp);
1068       } else if (dst.regClass() == s2) {
1069          Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
1070          Temp neqz;
1071          if (ctx->program->chip_class >= GFX8)
1072             neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
1073          else
1074             neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp();
1075          /* SCC gets zero-extended to 64 bit */
1076          bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1077       } else if (dst.regClass() == v1) {
1078          Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
1079          Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1080          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
1081       } else if (dst.regClass() == v2) {
1082          Temp upper = emit_extract_vector(ctx, src, 1, v1);
1083          Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
1084          Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1085          Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
1086          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
1087          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1088       } else {
1089          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1090          nir_print_instr(&instr->instr, stderr);
1091          fprintf(stderr, "\n");
1092       }
1093       break;
1094    }
1095    case nir_op_imax: {
1096       if (dst.regClass() == v1) {
1097          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1098       } else if (dst.regClass() == s1) {
1099          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1100       } else {
1101          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1102          nir_print_instr(&instr->instr, stderr);
1103          fprintf(stderr, "\n");
1104       }
1105       break;
1106    }
1107    case nir_op_umax: {
1108       if (dst.regClass() == v1) {
1109          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1110       } else if (dst.regClass() == s1) {
1111          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1112       } else {
1113          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1114          nir_print_instr(&instr->instr, stderr);
1115          fprintf(stderr, "\n");
1116       }
1117       break;
1118    }
1119    case nir_op_imin: {
1120       if (dst.regClass() == v1) {
1121          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1122       } else if (dst.regClass() == s1) {
1123          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1124       } else {
1125          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1126          nir_print_instr(&instr->instr, stderr);
1127          fprintf(stderr, "\n");
1128       }
1129       break;
1130    }
1131    case nir_op_umin: {
1132       if (dst.regClass() == v1) {
1133          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1134       } else if (dst.regClass() == s1) {
1135          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1136       } else {
1137          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1138          nir_print_instr(&instr->instr, stderr);
1139          fprintf(stderr, "\n");
1140       }
1141       break;
1142    }
1143    case nir_op_ior: {
1144       if (instr->dest.dest.ssa.bit_size == 1) {
1145          emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1146       } else if (dst.regClass() == v1) {
1147          emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1148       } else if (dst.regClass() == s1) {
1149          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1150       } else if (dst.regClass() == s2) {
1151          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1152       } else {
1153          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1154          nir_print_instr(&instr->instr, stderr);
1155          fprintf(stderr, "\n");
1156       }
1157       break;
1158    }
1159    case nir_op_iand: {
1160       if (instr->dest.dest.ssa.bit_size == 1) {
1161          emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1162       } else if (dst.regClass() == v1) {
1163          emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1164       } else if (dst.regClass() == s1) {
1165          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1166       } else if (dst.regClass() == s2) {
1167          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1168       } else {
1169          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1170          nir_print_instr(&instr->instr, stderr);
1171          fprintf(stderr, "\n");
1172       }
1173       break;
1174    }
1175    case nir_op_ixor: {
1176       if (instr->dest.dest.ssa.bit_size == 1) {
1177          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1178       } else if (dst.regClass() == v1) {
1179          emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1180       } else if (dst.regClass() == s1) {
1181          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1182       } else if (dst.regClass() == s2) {
1183          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1184       } else {
1185          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1186          nir_print_instr(&instr->instr, stderr);
1187          fprintf(stderr, "\n");
1188       }
1189       break;
1190    }
1191    case nir_op_ushr: {
1192       if (dst.regClass() == v1) {
1193          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1194       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1195          bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
1196                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1197       } else if (dst.regClass() == v2) {
1198          bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
1199                   get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1200       } else if (dst.regClass() == s2) {
1201          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1202       } else if (dst.regClass() == s1) {
1203          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1204       } else {
1205          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1206          nir_print_instr(&instr->instr, stderr);
1207          fprintf(stderr, "\n");
1208       }
1209       break;
1210    }
1211    case nir_op_ishl: {
1212       if (dst.regClass() == v1) {
1213          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
1214       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1215          bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
1216                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1217       } else if (dst.regClass() == v2) {
1218          bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
1219                   get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1220       } else if (dst.regClass() == s1) {
1221          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
1222       } else if (dst.regClass() == s2) {
1223          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1224       } else {
1225          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1226          nir_print_instr(&instr->instr, stderr);
1227          fprintf(stderr, "\n");
1228       }
1229       break;
1230    }
1231    case nir_op_ishr: {
1232       if (dst.regClass() == v1) {
1233          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1234       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1235          bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
1236                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1237       } else if (dst.regClass() == v2) {
1238          bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
1239                   get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1240       } else if (dst.regClass() == s1) {
1241          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1242       } else if (dst.regClass() == s2) {
1243          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1244       } else {
1245          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1246          nir_print_instr(&instr->instr, stderr);
1247          fprintf(stderr, "\n");
1248       }
1249       break;
1250    }
1251    case nir_op_find_lsb: {
1252       Temp src = get_alu_src(ctx, instr->src[0]);
1253       if (src.regClass() == s1) {
1254          bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1255       } else if (src.regClass() == v1) {
1256          emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1257       } else if (src.regClass() == s2) {
1258          bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1259       } else {
1260          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1261          nir_print_instr(&instr->instr, stderr);
1262          fprintf(stderr, "\n");
1263       }
1264       break;
1265    }
1266    case nir_op_ufind_msb:
1267    case nir_op_ifind_msb: {
1268       Temp src = get_alu_src(ctx, instr->src[0]);
1269       if (src.regClass() == s1 || src.regClass() == s2) {
1270          aco_opcode op = src.regClass() == s2 ?
1271                          (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
1272                          (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
1273          Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1274
1275          Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1276                                         Operand(src.size() * 32u - 1u), msb_rev);
1277          Temp msb = sub.def(0).getTemp();
1278          Temp carry = sub.def(1).getTemp();
1279
1280          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry));
1281       } else if (src.regClass() == v1) {
1282          aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1283          Temp msb_rev = bld.tmp(v1);
1284          emit_vop1_instruction(ctx, instr, op, msb_rev);
1285          Temp msb = bld.tmp(v1);
1286          Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
1287          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
1288       } else {
1289          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1290          nir_print_instr(&instr->instr, stderr);
1291          fprintf(stderr, "\n");
1292       }
1293       break;
1294    }
1295    case nir_op_bitfield_reverse: {
1296       if (dst.regClass() == s1) {
1297          bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1298       } else if (dst.regClass() == v1) {
1299          bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1300       } else {
1301          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1302          nir_print_instr(&instr->instr, stderr);
1303          fprintf(stderr, "\n");
1304       }
1305       break;
1306    }
1307    case nir_op_iadd: {
1308       if (dst.regClass() == s1) {
1309          emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1310          break;
1311       }
1312
1313       Temp src0 = get_alu_src(ctx, instr->src[0]);
1314       Temp src1 = get_alu_src(ctx, instr->src[1]);
1315       if (dst.regClass() == v1) {
1316          bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1317          break;
1318       }
1319
1320       assert(src0.size() == 2 && src1.size() == 2);
1321       Temp src00 = bld.tmp(src0.type(), 1);
1322       Temp src01 = bld.tmp(dst.type(), 1);
1323       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1324       Temp src10 = bld.tmp(src1.type(), 1);
1325       Temp src11 = bld.tmp(dst.type(), 1);
1326       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1327
1328       if (dst.regClass() == s2) {
1329          Temp carry = bld.tmp(s1);
1330          Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1331          Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
1332          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1333       } else if (dst.regClass() == v2) {
1334          Temp dst0 = bld.tmp(v1);
1335          Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1336          Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1337          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1338       } else {
1339          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1340          nir_print_instr(&instr->instr, stderr);
1341          fprintf(stderr, "\n");
1342       }
1343       break;
1344    }
1345    case nir_op_uadd_sat: {
1346       Temp src0 = get_alu_src(ctx, instr->src[0]);
1347       Temp src1 = get_alu_src(ctx, instr->src[1]);
1348       if (dst.regClass() == s1) {
1349          Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1350          bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1351                   src0, src1);
1352          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1353       } else if (dst.regClass() == v1) {
1354          if (ctx->options->chip_class >= GFX9) {
1355             aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1356             add->operands[0] = Operand(src0);
1357             add->operands[1] = Operand(src1);
1358             add->definitions[0] = Definition(dst);
1359             add->clamp = 1;
1360             ctx->block->instructions.emplace_back(std::move(add));
1361          } else {
1362             if (src1.regClass() != v1)
1363                std::swap(src0, src1);
1364             assert(src1.regClass() == v1);
1365             Temp tmp = bld.tmp(v1);
1366             Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1367             bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1368          }
1369       } else {
1370          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1371          nir_print_instr(&instr->instr, stderr);
1372          fprintf(stderr, "\n");
1373       }
1374       break;
1375    }
1376    case nir_op_uadd_carry: {
1377       Temp src0 = get_alu_src(ctx, instr->src[0]);
1378       Temp src1 = get_alu_src(ctx, instr->src[1]);
1379       if (dst.regClass() == s1) {
1380          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1381          break;
1382       }
1383       if (dst.regClass() == v1) {
1384          Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1385          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1386          break;
1387       }
1388
1389       Temp src00 = bld.tmp(src0.type(), 1);
1390       Temp src01 = bld.tmp(dst.type(), 1);
1391       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1392       Temp src10 = bld.tmp(src1.type(), 1);
1393       Temp src11 = bld.tmp(dst.type(), 1);
1394       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1395       if (dst.regClass() == s2) {
1396          Temp carry = bld.tmp(s1);
1397          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1398          carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1399          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1400       } else if (dst.regClass() == v2) {
1401          Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1402          carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1403          carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1404          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1405       } else {
1406          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1407          nir_print_instr(&instr->instr, stderr);
1408          fprintf(stderr, "\n");
1409       }
1410       break;
1411    }
1412    case nir_op_isub: {
1413       if (dst.regClass() == s1) {
1414          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1415          break;
1416       }
1417
1418       Temp src0 = get_alu_src(ctx, instr->src[0]);
1419       Temp src1 = get_alu_src(ctx, instr->src[1]);
1420       if (dst.regClass() == v1) {
1421          bld.vsub32(Definition(dst), src0, src1);
1422          break;
1423       }
1424
1425       Temp src00 = bld.tmp(src0.type(), 1);
1426       Temp src01 = bld.tmp(dst.type(), 1);
1427       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1428       Temp src10 = bld.tmp(src1.type(), 1);
1429       Temp src11 = bld.tmp(dst.type(), 1);
1430       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1431       if (dst.regClass() == s2) {
1432          Temp carry = bld.tmp(s1);
1433          Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1434          Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1435          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1436       } else if (dst.regClass() == v2) {
1437          Temp lower = bld.tmp(v1);
1438          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1439          Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1440          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1441       } else {
1442          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1443          nir_print_instr(&instr->instr, stderr);
1444          fprintf(stderr, "\n");
1445       }
1446       break;
1447    }
1448    case nir_op_usub_borrow: {
1449       Temp src0 = get_alu_src(ctx, instr->src[0]);
1450       Temp src1 = get_alu_src(ctx, instr->src[1]);
1451       if (dst.regClass() == s1) {
1452          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1453          break;
1454       } else if (dst.regClass() == v1) {
1455          Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1456          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1457          break;
1458       }
1459
1460       Temp src00 = bld.tmp(src0.type(), 1);
1461       Temp src01 = bld.tmp(dst.type(), 1);
1462       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1463       Temp src10 = bld.tmp(src1.type(), 1);
1464       Temp src11 = bld.tmp(dst.type(), 1);
1465       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1466       if (dst.regClass() == s2) {
1467          Temp borrow = bld.tmp(s1);
1468          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1469          borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1470          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1471       } else if (dst.regClass() == v2) {
1472          Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1473          borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1474          borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1475          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1476       } else {
1477          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1478          nir_print_instr(&instr->instr, stderr);
1479          fprintf(stderr, "\n");
1480       }
1481       break;
1482    }
1483    case nir_op_imul: {
1484       if (dst.regClass() == v1) {
1485          bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1486                   get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1487       } else if (dst.regClass() == s1) {
1488          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1489       } else {
1490          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1491          nir_print_instr(&instr->instr, stderr);
1492          fprintf(stderr, "\n");
1493       }
1494       break;
1495    }
1496    case nir_op_umul_high: {
1497       if (dst.regClass() == v1) {
1498          bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1499       } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1500          bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1501       } else if (dst.regClass() == s1) {
1502          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1503                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1504          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1505       } else {
1506          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1507          nir_print_instr(&instr->instr, stderr);
1508          fprintf(stderr, "\n");
1509       }
1510       break;
1511    }
1512    case nir_op_imul_high: {
1513       if (dst.regClass() == v1) {
1514          bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1515       } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1516          bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1517       } else if (dst.regClass() == s1) {
1518          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1519                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1520          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1521       } else {
1522          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1523          nir_print_instr(&instr->instr, stderr);
1524          fprintf(stderr, "\n");
1525       }
1526       break;
1527    }
1528    case nir_op_fmul: {
1529       Temp src0 = get_alu_src(ctx, instr->src[0]);
1530       Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1531       if (dst.regClass() == v2b) {
1532          Temp tmp = bld.tmp(v1);
1533          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, tmp, true);
1534          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1535       } else if (dst.regClass() == v1) {
1536          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1537       } else if (dst.regClass() == v2) {
1538          bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1);
1539       } else {
1540          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1541          nir_print_instr(&instr->instr, stderr);
1542          fprintf(stderr, "\n");
1543       }
1544       break;
1545    }
1546    case nir_op_fadd: {
1547       Temp src0 = get_alu_src(ctx, instr->src[0]);
1548       Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1549       if (dst.regClass() == v2b) {
1550          Temp tmp = bld.tmp(v1);
1551          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, tmp, true);
1552          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1553       } else if (dst.regClass() == v1) {
1554          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1555       } else if (dst.regClass() == v2) {
1556          bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1);
1557       } else {
1558          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1559          nir_print_instr(&instr->instr, stderr);
1560          fprintf(stderr, "\n");
1561       }
1562       break;
1563    }
1564    case nir_op_fsub: {
1565       Temp src0 = get_alu_src(ctx, instr->src[0]);
1566       Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1567       if (dst.regClass() == v2b) {
1568          Temp tmp = bld.tmp(v1);
1569          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1570             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, tmp, false);
1571          else
1572             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, tmp, true);
1573          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1574       } else if (dst.regClass() == v1) {
1575          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1576             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1577          else
1578             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1579       } else if (dst.regClass() == v2) {
1580          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1581                                      src0, src1);
1582          VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1583          sub->neg[1] = true;
1584       } else {
1585          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1586          nir_print_instr(&instr->instr, stderr);
1587          fprintf(stderr, "\n");
1588       }
1589       break;
1590    }
1591    case nir_op_fmax: {
1592       Temp src0 = get_alu_src(ctx, instr->src[0]);
1593       Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1594       if (dst.regClass() == v2b) {
1595          // TODO: check fp_mode.must_flush_denorms16_64
1596          Temp tmp = bld.tmp(v1);
1597          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, tmp, true);
1598          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1599       } else if (dst.regClass() == v1) {
1600          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1601       } else if (dst.regClass() == v2) {
1602          if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1603             Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2), src0, src1);
1604             bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1605          } else {
1606             bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1);
1607          }
1608       } else {
1609          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1610          nir_print_instr(&instr->instr, stderr);
1611          fprintf(stderr, "\n");
1612       }
1613       break;
1614    }
1615    case nir_op_fmin: {
1616       Temp src0 = get_alu_src(ctx, instr->src[0]);
1617       Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1618       if (dst.regClass() == v2b) {
1619          // TODO: check fp_mode.must_flush_denorms16_64
1620          Temp tmp = bld.tmp(v1);
1621          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, tmp, true);
1622          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1623       } else if (dst.regClass() == v1) {
1624          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1625       } else if (dst.regClass() == v2) {
1626          if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1627             Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), src0, src1);
1628             bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1629          } else {
1630             bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1);
1631          }
1632       } else {
1633          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1634          nir_print_instr(&instr->instr, stderr);
1635          fprintf(stderr, "\n");
1636       }
1637       break;
1638    }
1639    case nir_op_fmax3: {
1640       if (dst.size() == 1) {
1641          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1642       } else {
1643          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1644          nir_print_instr(&instr->instr, stderr);
1645          fprintf(stderr, "\n");
1646       }
1647       break;
1648    }
1649    case nir_op_fmin3: {
1650       if (dst.size() == 1) {
1651          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1652       } else {
1653          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1654          nir_print_instr(&instr->instr, stderr);
1655          fprintf(stderr, "\n");
1656       }
1657       break;
1658    }
1659    case nir_op_fmed3: {
1660       if (dst.size() == 1) {
1661          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1662       } else {
1663          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1664          nir_print_instr(&instr->instr, stderr);
1665          fprintf(stderr, "\n");
1666       }
1667       break;
1668    }
1669    case nir_op_umax3: {
1670       if (dst.size() == 1) {
1671          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1672       } else {
1673          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1674          nir_print_instr(&instr->instr, stderr);
1675          fprintf(stderr, "\n");
1676       }
1677       break;
1678    }
1679    case nir_op_umin3: {
1680       if (dst.size() == 1) {
1681          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1682       } else {
1683          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1684          nir_print_instr(&instr->instr, stderr);
1685          fprintf(stderr, "\n");
1686       }
1687       break;
1688    }
1689    case nir_op_umed3: {
1690       if (dst.size() == 1) {
1691          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1692       } else {
1693          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1694          nir_print_instr(&instr->instr, stderr);
1695          fprintf(stderr, "\n");
1696       }
1697       break;
1698    }
1699    case nir_op_imax3: {
1700       if (dst.size() == 1) {
1701          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1702       } else {
1703          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1704          nir_print_instr(&instr->instr, stderr);
1705          fprintf(stderr, "\n");
1706       }
1707       break;
1708    }
1709    case nir_op_imin3: {
1710       if (dst.size() == 1) {
1711          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1712       } else {
1713          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1714          nir_print_instr(&instr->instr, stderr);
1715          fprintf(stderr, "\n");
1716       }
1717       break;
1718    }
1719    case nir_op_imed3: {
1720       if (dst.size() == 1) {
1721          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1722       } else {
1723          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1724          nir_print_instr(&instr->instr, stderr);
1725          fprintf(stderr, "\n");
1726       }
1727       break;
1728    }
1729    case nir_op_cube_face_coord: {
1730       Temp in = get_alu_src(ctx, instr->src[0], 3);
1731       Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1732                       emit_extract_vector(ctx, in, 1, v1),
1733                       emit_extract_vector(ctx, in, 2, v1) };
1734       Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1735       ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1736       Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1737       Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1738       sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1739       tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1740       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1741       break;
1742    }
1743    case nir_op_cube_face_index: {
1744       Temp in = get_alu_src(ctx, instr->src[0], 3);
1745       Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1746                       emit_extract_vector(ctx, in, 1, v1),
1747                       emit_extract_vector(ctx, in, 2, v1) };
1748       bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1749       break;
1750    }
1751    case nir_op_bcsel: {
1752       emit_bcsel(ctx, instr, dst);
1753       break;
1754    }
1755    case nir_op_frsq: {
1756       Temp src = get_alu_src(ctx, instr->src[0]);
1757       if (dst.regClass() == v2b) {
1758          Temp tmp = bld.vop1(aco_opcode::v_rsq_f16, bld.def(v1), src);
1759          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1760       } else if (dst.regClass() == v1) {
1761          emit_rsq(ctx, bld, Definition(dst), src);
1762       } else if (dst.regClass() == v2) {
1763          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1764       } else {
1765          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1766          nir_print_instr(&instr->instr, stderr);
1767          fprintf(stderr, "\n");
1768       }
1769       break;
1770    }
1771    case nir_op_fneg: {
1772       Temp src = get_alu_src(ctx, instr->src[0]);
1773       if (dst.regClass() == v2b) {
1774          Temp tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x8000u), as_vgpr(ctx, src));
1775          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1776       } else if (dst.regClass() == v1) {
1777          if (ctx->block->fp_mode.must_flush_denorms32)
1778             src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1779          bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1780       } else if (dst.regClass() == v2) {
1781          if (ctx->block->fp_mode.must_flush_denorms16_64)
1782             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1783          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1784          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1785          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1786          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1787       } else {
1788          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1789          nir_print_instr(&instr->instr, stderr);
1790          fprintf(stderr, "\n");
1791       }
1792       break;
1793    }
1794    case nir_op_fabs: {
1795       Temp src = get_alu_src(ctx, instr->src[0]);
1796       if (dst.regClass() == v2b) {
1797          Temp tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFu), as_vgpr(ctx, src));
1798          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1799       } else if (dst.regClass() == v1) {
1800          if (ctx->block->fp_mode.must_flush_denorms32)
1801             src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1802          bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1803       } else if (dst.regClass() == v2) {
1804          if (ctx->block->fp_mode.must_flush_denorms16_64)
1805             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1806          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1807          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1808          upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1809          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1810       } else {
1811          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1812          nir_print_instr(&instr->instr, stderr);
1813          fprintf(stderr, "\n");
1814       }
1815       break;
1816    }
1817    case nir_op_fsat: {
1818       Temp src = get_alu_src(ctx, instr->src[0]);
1819       if (dst.regClass() == v2b) {
1820          Temp one = bld.copy(bld.def(s1), Operand(0x3c00u));
1821          Temp tmp = bld.vop3(aco_opcode::v_med3_f16, bld.def(v1), Operand(0u), one, src);
1822          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1823       } else if (dst.regClass() == v1) {
1824          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1825          /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
1826          // TODO: confirm that this holds under any circumstances
1827       } else if (dst.regClass() == v2) {
1828          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
1829          VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
1830          vop3->clamp = true;
1831       } else {
1832          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1833          nir_print_instr(&instr->instr, stderr);
1834          fprintf(stderr, "\n");
1835       }
1836       break;
1837    }
1838    case nir_op_flog2: {
1839       Temp src = get_alu_src(ctx, instr->src[0]);
1840       if (dst.regClass() == v2b) {
1841          Temp tmp = bld.vop1(aco_opcode::v_log_f16, bld.def(v1), src);
1842          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1843       } else if (dst.regClass() == v1) {
1844          emit_log2(ctx, bld, Definition(dst), src);
1845       } else {
1846          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1847          nir_print_instr(&instr->instr, stderr);
1848          fprintf(stderr, "\n");
1849       }
1850       break;
1851    }
1852    case nir_op_frcp: {
1853       Temp src = get_alu_src(ctx, instr->src[0]);
1854       if (dst.regClass() == v2b) {
1855          Temp tmp = bld.vop1(aco_opcode::v_rcp_f16, bld.def(v1), src);
1856          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1857       } else if (dst.regClass() == v1) {
1858          emit_rcp(ctx, bld, Definition(dst), src);
1859       } else if (dst.regClass() == v2) {
1860          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
1861       } else {
1862          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1863          nir_print_instr(&instr->instr, stderr);
1864          fprintf(stderr, "\n");
1865       }
1866       break;
1867    }
1868    case nir_op_fexp2: {
1869       if (dst.regClass() == v2b) {
1870          Temp src = get_alu_src(ctx, instr->src[0]);
1871          Temp tmp = bld.vop1(aco_opcode::v_exp_f16, bld.def(v1), src);
1872          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1873       } else if (dst.regClass() == v1) {
1874          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
1875       } else {
1876          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1877          nir_print_instr(&instr->instr, stderr);
1878          fprintf(stderr, "\n");
1879       }
1880       break;
1881    }
1882    case nir_op_fsqrt: {
1883       Temp src = get_alu_src(ctx, instr->src[0]);
1884       if (dst.regClass() == v2b) {
1885          Temp tmp = bld.vop1(aco_opcode::v_sqrt_f16, bld.def(v1), src);
1886          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1887       } else if (dst.regClass() == v1) {
1888          emit_sqrt(ctx, bld, Definition(dst), src);
1889       } else if (dst.regClass() == v2) {
1890          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
1891       } else {
1892          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1893          nir_print_instr(&instr->instr, stderr);
1894          fprintf(stderr, "\n");
1895       }
1896       break;
1897    }
1898    case nir_op_ffract: {
1899       if (dst.regClass() == v2b) {
1900          Temp src = get_alu_src(ctx, instr->src[0]);
1901          Temp tmp = bld.vop1(aco_opcode::v_fract_f16, bld.def(v1), src);
1902          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1903       } else if (dst.regClass() == v1) {
1904          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
1905       } else if (dst.regClass() == v2) {
1906          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
1907       } else {
1908          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1909          nir_print_instr(&instr->instr, stderr);
1910          fprintf(stderr, "\n");
1911       }
1912       break;
1913    }
1914    case nir_op_ffloor: {
1915       Temp src = get_alu_src(ctx, instr->src[0]);
1916       if (dst.regClass() == v2b) {
1917          Temp tmp = bld.vop1(aco_opcode::v_floor_f16, bld.def(v1), src);
1918          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1919       } else if (dst.regClass() == v1) {
1920          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
1921       } else if (dst.regClass() == v2) {
1922          emit_floor_f64(ctx, bld, Definition(dst), src);
1923       } else {
1924          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1925          nir_print_instr(&instr->instr, stderr);
1926          fprintf(stderr, "\n");
1927       }
1928       break;
1929    }
1930    case nir_op_fceil: {
1931       Temp src0 = get_alu_src(ctx, instr->src[0]);
1932       if (dst.regClass() == v2b) {
1933          Temp tmp = bld.vop1(aco_opcode::v_ceil_f16, bld.def(v1), src0);
1934          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1935       } else if (dst.regClass() == v1) {
1936          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
1937       } else if (dst.regClass() == v2) {
1938          if (ctx->options->chip_class >= GFX7) {
1939             emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
1940          } else {
1941             /* GFX6 doesn't support V_CEIL_F64, lower it. */
1942             /* trunc = trunc(src0)
1943              * if (src0 > 0.0 && src0 != trunc)
1944              *    trunc += 1.0
1945              */
1946             Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
1947             Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u));
1948             Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
1949             Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1);
1950             Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond);
1951             add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add);
1952             bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
1953          }
1954       } else {
1955          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1956          nir_print_instr(&instr->instr, stderr);
1957          fprintf(stderr, "\n");
1958       }
1959       break;
1960    }
1961    case nir_op_ftrunc: {
1962       Temp src = get_alu_src(ctx, instr->src[0]);
1963       if (dst.regClass() == v2b) {
1964          Temp tmp = bld.vop1(aco_opcode::v_trunc_f16, bld.def(v1), src);
1965          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1966       } else if (dst.regClass() == v1) {
1967          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
1968       } else if (dst.regClass() == v2) {
1969          emit_trunc_f64(ctx, bld, Definition(dst), src);
1970       } else {
1971          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1972          nir_print_instr(&instr->instr, stderr);
1973          fprintf(stderr, "\n");
1974       }
1975       break;
1976    }
1977    case nir_op_fround_even: {
1978       Temp src0 = get_alu_src(ctx, instr->src[0]);
1979       if (dst.regClass() == v2b) {
1980          Temp tmp = bld.vop1(aco_opcode::v_rndne_f16, bld.def(v1), src0);
1981          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
1982       } else if (dst.regClass() == v1) {
1983          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
1984       } else if (dst.regClass() == v2) {
1985          if (ctx->options->chip_class >= GFX7) {
1986             emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
1987          } else {
1988             /* GFX6 doesn't support V_RNDNE_F64, lower it. */
1989             Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
1990             bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);
1991
1992             Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u)));
1993             Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi));
1994             Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
1995             Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
1996             static_cast<VOP3A_instruction*>(sub)->neg[1] = true;
1997             tmp = sub->definitions[0].getTemp();
1998
1999             Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu));
2000             Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
2001             static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
2002             Temp cond = vop3->definitions[0].getTemp();
2003
2004             Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
2005             bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
2006             Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond);
2007             Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond);
2008
2009             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
2010          }
2011       } else {
2012          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2013          nir_print_instr(&instr->instr, stderr);
2014          fprintf(stderr, "\n");
2015       }
2016       break;
2017    }
2018    case nir_op_fsin:
2019    case nir_op_fcos: {
2020       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2021       aco_ptr<Instruction> norm;
2022       Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
2023       if (dst.regClass() == v2b) {
2024          Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
2025          aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2026          tmp = bld.vop1(opcode, bld.def(v1), tmp);
2027          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
2028       } else if (dst.regClass() == v1) {
2029          Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
2030
2031          /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2032          if (ctx->options->chip_class < GFX9)
2033             tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
2034
2035          aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2036          bld.vop1(opcode, Definition(dst), tmp);
2037       } else {
2038          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2039          nir_print_instr(&instr->instr, stderr);
2040          fprintf(stderr, "\n");
2041       }
2042       break;
2043    }
2044    case nir_op_ldexp: {
2045       if (dst.size() == 1) {
2046          bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
2047                   as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
2048                   get_alu_src(ctx, instr->src[1]));
2049       } else if (dst.size() == 2) {
2050          bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
2051                   as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
2052                   get_alu_src(ctx, instr->src[1]));
2053       } else {
2054          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2055          nir_print_instr(&instr->instr, stderr);
2056          fprintf(stderr, "\n");
2057       }
2058       break;
2059    }
2060    case nir_op_frexp_sig: {
2061       Temp src = get_alu_src(ctx, instr->src[0]);
2062       if (dst.regClass() == v2b) {
2063          Temp tmp = bld.vop1(aco_opcode::v_frexp_mant_f16, bld.def(v1), src);
2064          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
2065       } else if (dst.regClass() == v1) {
2066          bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src);
2067       } else if (dst.regClass() == v2) {
2068          bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src);
2069       } else {
2070          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2071          nir_print_instr(&instr->instr, stderr);
2072          fprintf(stderr, "\n");
2073       }
2074       break;
2075    }
2076    case nir_op_frexp_exp: {
2077       Temp src = get_alu_src(ctx, instr->src[0]);
2078       if (instr->src[0].src.ssa->bit_size == 16) {
2079          Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2080          bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), tmp, Operand(0u));
2081       } else if (instr->src[0].src.ssa->bit_size == 32) {
2082          bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src);
2083       } else if (instr->src[0].src.ssa->bit_size == 64) {
2084          bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src);
2085       } else {
2086          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2087          nir_print_instr(&instr->instr, stderr);
2088          fprintf(stderr, "\n");
2089       }
2090       break;
2091    }
2092    case nir_op_fsign: {
2093       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2094       if (dst.size() == 1) {
2095          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2096          src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
2097          cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2098          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
2099       } else if (dst.size() == 2) {
2100          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2101          Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
2102          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
2103
2104          cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2105          tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
2106          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2107
2108          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2109       } else {
2110          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2111          nir_print_instr(&instr->instr, stderr);
2112          fprintf(stderr, "\n");
2113       }
2114       break;
2115    }
2116    case nir_op_f2f16:
2117    case nir_op_f2f16_rtne: {
2118       Temp src = get_alu_src(ctx, instr->src[0]);
2119       if (instr->src[0].src.ssa->bit_size == 64)
2120          src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2121       src = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
2122       bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src);
2123       break;
2124    }
2125    case nir_op_f2f16_rtz: {
2126       Temp src = get_alu_src(ctx, instr->src[0]);
2127       if (instr->src[0].src.ssa->bit_size == 64)
2128          src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2129       src = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), src, Operand(0u));
2130       bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src);
2131       break;
2132    }
2133    case nir_op_f2f32: {
2134       if (instr->src[0].src.ssa->bit_size == 16) {
2135          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2136       } else if (instr->src[0].src.ssa->bit_size == 64) {
2137          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2138       } else {
2139          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2140          nir_print_instr(&instr->instr, stderr);
2141          fprintf(stderr, "\n");
2142       }
2143       break;
2144    }
2145    case nir_op_f2f64: {
2146       Temp src = get_alu_src(ctx, instr->src[0]);
2147       if (instr->src[0].src.ssa->bit_size == 16)
2148          src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2149       bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2150       break;
2151    }
2152    case nir_op_i2f32: {
2153       assert(dst.size() == 1);
2154       emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst);
2155       break;
2156    }
2157    case nir_op_i2f64: {
2158       if (instr->src[0].src.ssa->bit_size == 32) {
2159          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst);
2160       } else if (instr->src[0].src.ssa->bit_size == 64) {
2161          Temp src = get_alu_src(ctx, instr->src[0]);
2162          RegClass rc = RegClass(src.type(), 1);
2163          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2164          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2165          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2166          upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2167          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
2168          bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2169
2170       } else {
2171          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2172          nir_print_instr(&instr->instr, stderr);
2173          fprintf(stderr, "\n");
2174       }
2175       break;
2176    }
2177    case nir_op_u2f32: {
2178       assert(dst.size() == 1);
2179       emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst);
2180       break;
2181    }
2182    case nir_op_u2f64: {
2183       if (instr->src[0].src.ssa->bit_size == 32) {
2184          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst);
2185       } else if (instr->src[0].src.ssa->bit_size == 64) {
2186          Temp src = get_alu_src(ctx, instr->src[0]);
2187          RegClass rc = RegClass(src.type(), 1);
2188          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2189          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2190          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2191          upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2192          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
2193          bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2194       } else {
2195          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2196          nir_print_instr(&instr->instr, stderr);
2197          fprintf(stderr, "\n");
2198       }
2199       break;
2200    }
2201    case nir_op_f2i16: {
2202       Temp src = get_alu_src(ctx, instr->src[0]);
2203       if (instr->src[0].src.ssa->bit_size == 16)
2204          src = bld.vop1(aco_opcode::v_cvt_i16_f16, bld.def(v1), src);
2205       else if (instr->src[0].src.ssa->bit_size == 32)
2206          src = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src);
2207       else
2208          src = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src);
2209
2210       if (dst.type() == RegType::vgpr)
2211          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src);
2212       else
2213          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
2214       break;
2215    }
2216    case nir_op_f2u16: {
2217       Temp src = get_alu_src(ctx, instr->src[0]);
2218       if (instr->src[0].src.ssa->bit_size == 16)
2219          src = bld.vop1(aco_opcode::v_cvt_u16_f16, bld.def(v1), src);
2220       else if (instr->src[0].src.ssa->bit_size == 32)
2221          src = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src);
2222       else
2223          src = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src);
2224
2225       if (dst.type() == RegType::vgpr)
2226          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src);
2227       else
2228          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
2229       break;
2230    }
2231    case nir_op_f2i32: {
2232       Temp src = get_alu_src(ctx, instr->src[0]);
2233       if (instr->src[0].src.ssa->bit_size == 32) {
2234          if (dst.type() == RegType::vgpr)
2235             bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
2236          else
2237             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2238                        bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
2239
2240       } else if (instr->src[0].src.ssa->bit_size == 64) {
2241          if (dst.type() == RegType::vgpr)
2242             bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
2243          else
2244             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2245                        bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
2246
2247       } else {
2248          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2249          nir_print_instr(&instr->instr, stderr);
2250          fprintf(stderr, "\n");
2251       }
2252       break;
2253    }
2254    case nir_op_f2u32: {
2255       Temp src = get_alu_src(ctx, instr->src[0]);
2256       if (instr->src[0].src.ssa->bit_size == 32) {
2257          if (dst.type() == RegType::vgpr)
2258             bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
2259          else
2260             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2261                        bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
2262
2263       } else if (instr->src[0].src.ssa->bit_size == 64) {
2264          if (dst.type() == RegType::vgpr)
2265             bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
2266          else
2267             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2268                        bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
2269
2270       } else {
2271          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2272          nir_print_instr(&instr->instr, stderr);
2273          fprintf(stderr, "\n");
2274       }
2275       break;
2276    }
2277    case nir_op_f2i64: {
2278       Temp src = get_alu_src(ctx, instr->src[0]);
2279       if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
2280          Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2281          exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
2282          Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
2283          Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
2284          mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
2285          mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
2286          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
2287          Temp new_exponent = bld.tmp(v1);
2288          Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
2289          if (ctx->program->chip_class >= GFX8)
2290             mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
2291          else
2292             mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
2293          Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
2294          Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2295          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2296          lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
2297          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
2298          lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
2299          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
2300          Temp new_lower = bld.tmp(v1);
2301          borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
2302          Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
2303          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
2304
2305       } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
2306          if (src.type() == RegType::vgpr)
2307             src = bld.as_uniform(src);
2308          Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
2309          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
2310          exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
2311          exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
2312          Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
2313          Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
2314          mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
2315          mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
2316          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
2317          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
2318          mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
2319          Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
2320          Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
2321          mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
2322          Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2323          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2324          lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
2325          upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
2326          Temp borrow = bld.tmp(s1);
2327          lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
2328          upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
2329          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2330
2331       } else if (instr->src[0].src.ssa->bit_size == 64) {
2332          Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
2333          Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2334          Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2335          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
2336          Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2337          Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2338          Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2339          Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
2340          if (dst.type() == RegType::sgpr) {
2341             lower = bld.as_uniform(lower);
2342             upper = bld.as_uniform(upper);
2343          }
2344          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2345
2346       } else {
2347          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2348          nir_print_instr(&instr->instr, stderr);
2349          fprintf(stderr, "\n");
2350       }
2351       break;
2352    }
2353    case nir_op_f2u64: {
2354       Temp src = get_alu_src(ctx, instr->src[0]);
2355       if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
2356          Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2357          Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent);
2358          exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
2359          Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
2360          mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
2361          Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
2362          Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
2363          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
2364          Temp new_exponent = bld.tmp(v1);
2365          Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
2366          if (ctx->program->chip_class >= GFX8)
2367             mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
2368          else
2369             mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
2370          Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2371          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2372          lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
2373          upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
2374          lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
2375          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
2376          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2377
2378       } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
2379          if (src.type() == RegType::vgpr)
2380             src = bld.as_uniform(src);
2381          Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
2382          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
2383          exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
2384          Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
2385          mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
2386          Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
2387          Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
2388          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
2389          Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
2390          mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
2391          Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
2392          mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
2393          Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2394          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2395          Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
2396          lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
2397          upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
2398          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2399
2400       } else if (instr->src[0].src.ssa->bit_size == 64) {
2401          Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
2402          Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2403          Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2404          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
2405          Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2406          Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2407          Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2408          Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
2409          if (dst.type() == RegType::sgpr) {
2410             lower = bld.as_uniform(lower);
2411             upper = bld.as_uniform(upper);
2412          }
2413          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2414
2415       } else {
2416          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2417          nir_print_instr(&instr->instr, stderr);
2418          fprintf(stderr, "\n");
2419       }
2420       break;
2421    }
2422    case nir_op_b2f32: {
2423       Temp src = get_alu_src(ctx, instr->src[0]);
2424       assert(src.regClass() == bld.lm);
2425
2426       if (dst.regClass() == s1) {
2427          src = bool_to_scalar_condition(ctx, src);
2428          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
2429       } else if (dst.regClass() == v1) {
2430          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
2431       } else {
2432          unreachable("Wrong destination register class for nir_op_b2f32.");
2433       }
2434       break;
2435    }
2436    case nir_op_b2f64: {
2437       Temp src = get_alu_src(ctx, instr->src[0]);
2438       assert(src.regClass() == bld.lm);
2439
2440       if (dst.regClass() == s2) {
2441          src = bool_to_scalar_condition(ctx, src);
2442          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
2443       } else if (dst.regClass() == v2) {
2444          Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v2), Operand(0x3FF00000u));
2445          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
2446          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2447       } else {
2448          unreachable("Wrong destination register class for nir_op_b2f64.");
2449       }
2450       break;
2451    }
2452    case nir_op_i2i8:
2453    case nir_op_u2u8: {
2454       Temp src = get_alu_src(ctx, instr->src[0]);
2455       /* we can actually just say dst = src */
2456       if (src.regClass() == s1)
2457          bld.copy(Definition(dst), src);
2458       else
2459          emit_extract_vector(ctx, src, 0, dst);
2460       break;
2461    }
2462    case nir_op_i2i16: {
2463       Temp src = get_alu_src(ctx, instr->src[0]);
2464       if (instr->src[0].src.ssa->bit_size == 8) {
2465          if (dst.regClass() == s1) {
2466             bld.sop1(aco_opcode::s_sext_i32_i8, Definition(dst), Operand(src));
2467          } else {
2468             assert(src.regClass() == v1b);
2469             aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
2470             sdwa->operands[0] = Operand(src);
2471             sdwa->definitions[0] = Definition(dst);
2472             sdwa->sel[0] = sdwa_sbyte;
2473             sdwa->dst_sel = sdwa_sword;
2474             ctx->block->instructions.emplace_back(std::move(sdwa));
2475          }
2476       } else {
2477          Temp src = get_alu_src(ctx, instr->src[0]);
2478          /* we can actually just say dst = src */
2479          if (src.regClass() == s1)
2480             bld.copy(Definition(dst), src);
2481          else
2482             emit_extract_vector(ctx, src, 0, dst);
2483       }
2484       break;
2485    }
2486    case nir_op_u2u16: {
2487       Temp src = get_alu_src(ctx, instr->src[0]);
2488       if (instr->src[0].src.ssa->bit_size == 8) {
2489          if (dst.regClass() == s1)
2490             bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFu), src);
2491          else {
2492             assert(src.regClass() == v1b);
2493             aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
2494             sdwa->operands[0] = Operand(src);
2495             sdwa->definitions[0] = Definition(dst);
2496             sdwa->sel[0] = sdwa_ubyte;
2497             sdwa->dst_sel = sdwa_uword;
2498             ctx->block->instructions.emplace_back(std::move(sdwa));
2499          }
2500       } else {
2501          Temp src = get_alu_src(ctx, instr->src[0]);
2502          /* we can actually just say dst = src */
2503          if (src.regClass() == s1)
2504             bld.copy(Definition(dst), src);
2505          else
2506             emit_extract_vector(ctx, src, 0, dst);
2507       }
2508       break;
2509    }
2510    case nir_op_i2i32: {
2511       Temp src = get_alu_src(ctx, instr->src[0]);
2512       if (instr->src[0].src.ssa->bit_size == 8) {
2513          if (dst.regClass() == s1) {
2514             bld.sop1(aco_opcode::s_sext_i32_i8, Definition(dst), Operand(src));
2515          } else {
2516             assert(src.regClass() == v1b);
2517             aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
2518             sdwa->operands[0] = Operand(src);
2519             sdwa->definitions[0] = Definition(dst);
2520             sdwa->sel[0] = sdwa_sbyte;
2521             sdwa->dst_sel = sdwa_sdword;
2522             ctx->block->instructions.emplace_back(std::move(sdwa));
2523          }
2524       } else if (instr->src[0].src.ssa->bit_size == 16) {
2525          if (dst.regClass() == s1) {
2526             bld.sop1(aco_opcode::s_sext_i32_i16, Definition(dst), Operand(src));
2527          } else {
2528             assert(src.regClass() == v2b);
2529             aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
2530             sdwa->operands[0] = Operand(src);
2531             sdwa->definitions[0] = Definition(dst);
2532             sdwa->sel[0] = sdwa_sword;
2533             sdwa->dst_sel = sdwa_udword;
2534             ctx->block->instructions.emplace_back(std::move(sdwa));
2535          }
2536       } else if (instr->src[0].src.ssa->bit_size == 64) {
2537          /* we can actually just say dst = src, as it would map the lower register */
2538          emit_extract_vector(ctx, src, 0, dst);
2539       } else {
2540          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2541          nir_print_instr(&instr->instr, stderr);
2542          fprintf(stderr, "\n");
2543       }
2544       break;
2545    }
2546    case nir_op_u2u32: {
2547       Temp src = get_alu_src(ctx, instr->src[0]);
2548       if (instr->src[0].src.ssa->bit_size == 8) {
2549          if (dst.regClass() == s1)
2550             bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFu), src);
2551          else {
2552             assert(src.regClass() == v1b);
2553             aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
2554             sdwa->operands[0] = Operand(src);
2555             sdwa->definitions[0] = Definition(dst);
2556             sdwa->sel[0] = sdwa_ubyte;
2557             sdwa->dst_sel = sdwa_udword;
2558             ctx->block->instructions.emplace_back(std::move(sdwa));
2559          }
2560       } else if (instr->src[0].src.ssa->bit_size == 16) {
2561          if (dst.regClass() == s1) {
2562             bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
2563          } else {
2564             assert(src.regClass() == v2b);
2565             aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
2566             sdwa->operands[0] = Operand(src);
2567             sdwa->definitions[0] = Definition(dst);
2568             sdwa->sel[0] = sdwa_uword;
2569             sdwa->dst_sel = sdwa_udword;
2570             ctx->block->instructions.emplace_back(std::move(sdwa));
2571          }
2572       } else if (instr->src[0].src.ssa->bit_size == 64) {
2573          /* we can actually just say dst = src, as it would map the lower register */
2574          emit_extract_vector(ctx, src, 0, dst);
2575       } else {
2576          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2577          nir_print_instr(&instr->instr, stderr);
2578          fprintf(stderr, "\n");
2579       }
2580       break;
2581    }
2582    case nir_op_i2i64: {
2583       Temp src = get_alu_src(ctx, instr->src[0]);
2584       if (src.regClass() == s1) {
2585          Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
2586          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
2587       } else if (src.regClass() == v1) {
2588          Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
2589          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
2590       } else {
2591          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2592          nir_print_instr(&instr->instr, stderr);
2593          fprintf(stderr, "\n");
2594       }
2595       break;
2596    }
2597    case nir_op_u2u64: {
2598       Temp src = get_alu_src(ctx, instr->src[0]);
2599       if (instr->src[0].src.ssa->bit_size == 32) {
2600          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
2601       } else {
2602          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2603          nir_print_instr(&instr->instr, stderr);
2604          fprintf(stderr, "\n");
2605       }
2606       break;
2607    }
2608    case nir_op_b2b32:
2609    case nir_op_b2i32: {
2610       Temp src = get_alu_src(ctx, instr->src[0]);
2611       assert(src.regClass() == bld.lm);
2612
2613       if (dst.regClass() == s1) {
2614          // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
2615          bool_to_scalar_condition(ctx, src, dst);
2616       } else if (dst.regClass() == v1) {
2617          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
2618       } else {
2619          unreachable("Invalid register class for b2i32");
2620       }
2621       break;
2622    }
2623    case nir_op_b2b1:
2624    case nir_op_i2b1: {
2625       Temp src = get_alu_src(ctx, instr->src[0]);
2626       assert(dst.regClass() == bld.lm);
2627
2628       if (src.type() == RegType::vgpr) {
2629          assert(src.regClass() == v1 || src.regClass() == v2);
2630          assert(dst.regClass() == bld.lm);
2631          bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
2632                   Definition(dst), Operand(0u), src).def(0).setHint(vcc);
2633       } else {
2634          assert(src.regClass() == s1 || src.regClass() == s2);
2635          Temp tmp;
2636          if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
2637             tmp = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand(0u), src).def(1).getTemp();
2638          } else {
2639             tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
2640                            bld.scc(bld.def(s1)), Operand(0u), src);
2641          }
2642          bool_to_vector_condition(ctx, tmp, dst);
2643       }
2644       break;
2645    }
2646    case nir_op_pack_64_2x32_split: {
2647       Temp src0 = get_alu_src(ctx, instr->src[0]);
2648       Temp src1 = get_alu_src(ctx, instr->src[1]);
2649
2650       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2651       break;
2652    }
2653    case nir_op_unpack_64_2x32_split_x:
2654       bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2655       break;
2656    case nir_op_unpack_64_2x32_split_y:
2657       bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2658       break;
2659    case nir_op_unpack_32_2x16_split_x:
2660       if (dst.type() == RegType::vgpr) {
2661          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2662       } else {
2663          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
2664       }
2665       break;
2666    case nir_op_unpack_32_2x16_split_y:
2667       if (dst.type() == RegType::vgpr) {
2668          bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2669       } else {
2670          bld.sop2(aco_opcode::s_bfe_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), Operand(uint32_t(16 << 16 | 16)));
2671       }
2672       break;
2673    case nir_op_pack_32_2x16_split: {
2674       Temp src0 = get_alu_src(ctx, instr->src[0]);
2675       Temp src1 = get_alu_src(ctx, instr->src[1]);
2676       if (dst.regClass() == v1) {
2677          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2678       } else {
2679          src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, Operand(0xFFFFu));
2680          src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1, Operand(16u));
2681          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
2682       }
2683       break;
2684    }
2685    case nir_op_pack_half_2x16: {
2686       Temp src = get_alu_src(ctx, instr->src[0], 2);
2687
2688       if (dst.regClass() == v1) {
2689          Temp src0 = bld.tmp(v1);
2690          Temp src1 = bld.tmp(v1);
2691          bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2692          if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz)
2693             bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2694          else
2695             bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst),
2696                      bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0),
2697                      bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1));
2698       } else {
2699          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2700          nir_print_instr(&instr->instr, stderr);
2701          fprintf(stderr, "\n");
2702       }
2703       break;
2704    }
2705    case nir_op_unpack_half_2x16_split_x: {
2706       if (dst.regClass() == v1) {
2707          Builder bld(ctx->program, ctx->block);
2708          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2709       } else {
2710          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2711          nir_print_instr(&instr->instr, stderr);
2712          fprintf(stderr, "\n");
2713       }
2714       break;
2715    }
2716    case nir_op_unpack_half_2x16_split_y: {
2717       if (dst.regClass() == v1) {
2718          Builder bld(ctx->program, ctx->block);
2719          /* TODO: use SDWA here */
2720          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2721                   bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2722       } else {
2723          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2724          nir_print_instr(&instr->instr, stderr);
2725          fprintf(stderr, "\n");
2726       }
2727       break;
2728    }
2729    case nir_op_fquantize2f16: {
2730       Temp src = get_alu_src(ctx, instr->src[0]);
2731       Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
2732       Temp f32, cmp_res;
2733
2734       if (ctx->program->chip_class >= GFX8) {
2735          Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */
2736          cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);
2737          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2738       } else {
2739          /* 0x38800000 is smallest half float value (2^-14) in 32-bit float,
2740           * so compare the result and flush to 0 if it's smaller.
2741           */
2742          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2743          Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u));
2744          Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), f32, smallest);
2745          static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
2746          cmp_res = vop3->definitions[0].getTemp();
2747       }
2748
2749       if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32 || ctx->program->chip_class < GFX8) {
2750          Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src));
2751          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
2752       } else {
2753          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2754       }
2755       break;
2756    }
2757    case nir_op_bfm: {
2758       Temp bits = get_alu_src(ctx, instr->src[0]);
2759       Temp offset = get_alu_src(ctx, instr->src[1]);
2760
2761       if (dst.regClass() == s1) {
2762          bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2763       } else if (dst.regClass() == v1) {
2764          bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2765       } else {
2766          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2767          nir_print_instr(&instr->instr, stderr);
2768          fprintf(stderr, "\n");
2769       }
2770       break;
2771    }
2772    case nir_op_bitfield_select: {
2773       /* (mask & insert) | (~mask & base) */
2774       Temp bitmask = get_alu_src(ctx, instr->src[0]);
2775       Temp insert = get_alu_src(ctx, instr->src[1]);
2776       Temp base = get_alu_src(ctx, instr->src[2]);
2777
2778       /* dst = (insert & bitmask) | (base & ~bitmask) */
2779       if (dst.regClass() == s1) {
2780          aco_ptr<Instruction> sop2;
2781          nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2782          nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2783          Operand lhs;
2784          if (const_insert && const_bitmask) {
2785             lhs = Operand(const_insert->u32 & const_bitmask->u32);
2786          } else {
2787             insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2788             lhs = Operand(insert);
2789          }
2790
2791          Operand rhs;
2792          nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2793          if (const_base && const_bitmask) {
2794             rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2795          } else {
2796             base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2797             rhs = Operand(base);
2798          }
2799
2800          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2801
2802       } else if (dst.regClass() == v1) {
2803          if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2804             base = as_vgpr(ctx, base);
2805          if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2806             insert = as_vgpr(ctx, insert);
2807
2808          bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2809
2810       } else {
2811          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2812          nir_print_instr(&instr->instr, stderr);
2813          fprintf(stderr, "\n");
2814       }
2815       break;
2816    }
2817    case nir_op_ubfe:
2818    case nir_op_ibfe: {
2819       Temp base = get_alu_src(ctx, instr->src[0]);
2820       Temp offset = get_alu_src(ctx, instr->src[1]);
2821       Temp bits = get_alu_src(ctx, instr->src[2]);
2822
2823       if (dst.type() == RegType::sgpr) {
2824          Operand extract;
2825          nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2826          nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2827          if (const_offset && const_bits) {
2828             uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2829             extract = Operand(const_extract);
2830          } else {
2831             Operand width;
2832             if (const_bits) {
2833                width = Operand(const_bits->u32 << 16);
2834             } else {
2835                width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2836             }
2837             extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2838          }
2839
2840          aco_opcode opcode;
2841          if (dst.regClass() == s1) {
2842             if (instr->op == nir_op_ubfe)
2843                opcode = aco_opcode::s_bfe_u32;
2844             else
2845                opcode = aco_opcode::s_bfe_i32;
2846          } else if (dst.regClass() == s2) {
2847             if (instr->op == nir_op_ubfe)
2848                opcode = aco_opcode::s_bfe_u64;
2849             else
2850                opcode = aco_opcode::s_bfe_i64;
2851          } else {
2852             unreachable("Unsupported BFE bit size");
2853          }
2854
2855          bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2856
2857       } else {
2858          aco_opcode opcode;
2859          if (dst.regClass() == v1) {
2860             if (instr->op == nir_op_ubfe)
2861                opcode = aco_opcode::v_bfe_u32;
2862             else
2863                opcode = aco_opcode::v_bfe_i32;
2864          } else {
2865             unreachable("Unsupported BFE bit size");
2866          }
2867
2868          emit_vop3a_instruction(ctx, instr, opcode, dst);
2869       }
2870       break;
2871    }
2872    case nir_op_bit_count: {
2873       Temp src = get_alu_src(ctx, instr->src[0]);
2874       if (src.regClass() == s1) {
2875          bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2876       } else if (src.regClass() == v1) {
2877          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2878       } else if (src.regClass() == v2) {
2879          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2880                   emit_extract_vector(ctx, src, 1, v1),
2881                   bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2882                            emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2883       } else if (src.regClass() == s2) {
2884          bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2885       } else {
2886          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2887          nir_print_instr(&instr->instr, stderr);
2888          fprintf(stderr, "\n");
2889       }
2890       break;
2891    }
2892    case nir_op_flt: {
2893       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64);
2894       break;
2895    }
2896    case nir_op_fge: {
2897       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64);
2898       break;
2899    }
2900    case nir_op_feq: {
2901       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64);
2902       break;
2903    }
2904    case nir_op_fne: {
2905       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64);
2906       break;
2907    }
2908    case nir_op_ilt: {
2909       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
2910       break;
2911    }
2912    case nir_op_ige: {
2913       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
2914       break;
2915    }
2916    case nir_op_ieq: {
2917       if (instr->src[0].src.ssa->bit_size == 1)
2918          emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
2919       else
2920          emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
2921                          ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
2922       break;
2923    }
2924    case nir_op_ine: {
2925       if (instr->src[0].src.ssa->bit_size == 1)
2926          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
2927       else
2928          emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
2929                          ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
2930       break;
2931    }
2932    case nir_op_ult: {
2933       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
2934       break;
2935    }
2936    case nir_op_uge: {
2937       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
2938       break;
2939    }
2940    case nir_op_fddx:
2941    case nir_op_fddy:
2942    case nir_op_fddx_fine:
2943    case nir_op_fddy_fine:
2944    case nir_op_fddx_coarse:
2945    case nir_op_fddy_coarse: {
2946       Temp src = get_alu_src(ctx, instr->src[0]);
2947       uint16_t dpp_ctrl1, dpp_ctrl2;
2948       if (instr->op == nir_op_fddx_fine) {
2949          dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
2950          dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
2951       } else if (instr->op == nir_op_fddy_fine) {
2952          dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
2953          dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
2954       } else {
2955          dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
2956          if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
2957             dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
2958          else
2959             dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
2960       }
2961
2962       Temp tmp;
2963       if (ctx->program->chip_class >= GFX8) {
2964          Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
2965          tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
2966       } else {
2967          Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
2968          Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
2969          tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
2970       }
2971       emit_wqm(ctx, tmp, dst, true);
2972       break;
2973    }
2974    default:
2975       fprintf(stderr, "Unknown NIR ALU instr: ");
2976       nir_print_instr(&instr->instr, stderr);
2977       fprintf(stderr, "\n");
2978    }
2979 }
2980
2981 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
2982 {
2983    Temp dst = get_ssa_temp(ctx, &instr->def);
2984
2985    // TODO: we really want to have the resulting type as this would allow for 64bit literals
2986    // which get truncated the lsb if double and msb if int
2987    // for now, we only use s_mov_b64 with 64bit inline constants
2988    assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
2989    assert(dst.type() == RegType::sgpr);
2990
2991    Builder bld(ctx->program, ctx->block);
2992
2993    if (instr->def.bit_size == 1) {
2994       assert(dst.regClass() == bld.lm);
2995       int val = instr->value[0].b ? -1 : 0;
2996       Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val);
2997       bld.sop1(Builder::s_mov, Definition(dst), op);
2998    } else if (dst.size() == 1) {
2999       bld.copy(Definition(dst), Operand(instr->value[0].u32));
3000    } else {
3001       assert(dst.size() != 1);
3002       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3003       if (instr->def.bit_size == 64)
3004          for (unsigned i = 0; i < dst.size(); i++)
3005             vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
3006       else {
3007          for (unsigned i = 0; i < dst.size(); i++)
3008             vec->operands[i] = Operand{instr->value[i].u32};
3009       }
3010       vec->definitions[0] = Definition(dst);
3011       ctx->block->instructions.emplace_back(std::move(vec));
3012    }
3013 }
3014
/* Replaces every bit of `mask` by a group of `multiplier` identical bits,
 * e.g. widen_mask(0b101, 2) == 0b110011.
 *
 * Groups (or parts of groups) that would land above bit 31 are dropped, and a
 * multiplier of 0 yields 0. All shift amounts are kept below 32 so that the
 * result is well defined for every input.
 */
uint32_t widen_mask(uint32_t mask, unsigned multiplier)
{
   /* A single group already fills (or exceeds) the whole result: only bit 0
    * of the input can contribute. Shifting by >= 32 would be undefined. */
   if (multiplier >= 32)
      return (mask & 1u) ? UINT32_MAX : 0u;

   const uint32_t group = (1u << multiplier) - 1u;
   uint32_t new_mask = 0;
   /* Stop once no input bits are left or the next group starts past bit 31. */
   for (unsigned i = 0; i < 32 && (mask >> i) != 0 && i * multiplier < 32; ++i) {
      if (mask & (1u << i))
         new_mask |= group << (i * multiplier);
   }
   return new_mask;
}
3023
3024 Operand load_lds_size_m0(isel_context *ctx)
3025 {
3026    /* TODO: m0 does not need to be initialized on GFX9+ */
3027    Builder bld(ctx->program, ctx->block);
3028    return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
3029 }
3030
3031 Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
3032               Temp address, unsigned base_offset, unsigned align)
3033 {
3034    assert(util_is_power_of_two_nonzero(align) && align >= 4);
3035
3036    Builder bld(ctx->program, ctx->block);
3037
3038    Operand m = load_lds_size_m0(ctx);
3039
3040    unsigned num_components = dst.size() * 4u / elem_size_bytes;
3041    unsigned bytes_read = 0;
3042    unsigned result_size = 0;
3043    unsigned total_bytes = num_components * elem_size_bytes;
3044    std::array<Temp, NIR_MAX_VEC_COMPONENTS> result;
3045    bool large_ds_read = ctx->options->chip_class >= GFX7;
3046    bool usable_read2 = ctx->options->chip_class >= GFX7;
3047
3048    while (bytes_read < total_bytes) {
3049       unsigned todo = total_bytes - bytes_read;
3050       bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0;
3051       bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0;
3052
3053       aco_opcode op = aco_opcode::last_opcode;
3054       bool read2 = false;
3055       if (todo >= 16 && aligned16 && large_ds_read) {
3056          op = aco_opcode::ds_read_b128;
3057          todo = 16;
3058       } else if (todo >= 16 && aligned8 && usable_read2) {
3059          op = aco_opcode::ds_read2_b64;
3060          read2 = true;
3061          todo = 16;
3062       } else if (todo >= 12 && aligned16 && large_ds_read) {
3063          op = aco_opcode::ds_read_b96;
3064          todo = 12;
3065       } else if (todo >= 8 && aligned8) {
3066          op = aco_opcode::ds_read_b64;
3067          todo = 8;
3068       } else if (todo >= 8 && usable_read2) {
3069          op = aco_opcode::ds_read2_b32;
3070          read2 = true;
3071          todo = 8;
3072       } else if (todo >= 4) {
3073          op = aco_opcode::ds_read_b32;
3074          todo = 4;
3075       } else {
3076          assert(false);
3077       }
3078       assert(todo % elem_size_bytes == 0);
3079       unsigned num_elements = todo / elem_size_bytes;
3080       unsigned offset = base_offset + bytes_read;
3081       unsigned max_offset = read2 ? 1019 : 65535;
3082
3083       Temp address_offset = address;
3084       if (offset > max_offset) {
3085          address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
3086          offset = bytes_read;
3087       }
3088       assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */
3089
3090       Temp res;
3091       if (num_components == 1 && dst.type() == RegType::vgpr)
3092          res = dst;
3093       else
3094          res = bld.tmp(RegClass(RegType::vgpr, todo / 4));
3095
3096       if (read2)
3097          res = bld.ds(op, Definition(res), address_offset, m, offset / (todo / 2), (offset / (todo / 2)) + 1);
3098       else
3099          res = bld.ds(op, Definition(res), address_offset, m, offset);
3100
3101       if (num_components == 1) {
3102          assert(todo == total_bytes);
3103          if (dst.type() == RegType::sgpr)
3104             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
3105          return dst;
3106       }
3107
3108       if (dst.type() == RegType::sgpr) {
3109          Temp new_res = bld.tmp(RegType::sgpr, res.size());
3110          expand_vector(ctx, res, new_res, res.size(), (1 << res.size()) - 1);
3111          res = new_res;
3112       }
3113
3114       if (num_elements == 1) {
3115          result[result_size++] = res;
3116       } else {
3117          assert(res != dst && res.size() % num_elements == 0);
3118          aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)};
3119          split->operands[0] = Operand(res);
3120          for (unsigned i = 0; i < num_elements; i++)
3121             split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4));
3122          ctx->block->instructions.emplace_back(std::move(split));
3123       }
3124
3125       bytes_read += todo;
3126    }
3127
3128    assert(result_size == num_components && result_size > 1);
3129    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)};
3130    for (unsigned i = 0; i < result_size; i++)
3131       vec->operands[i] = Operand(result[i]);
3132    vec->definitions[0] = Definition(dst);
3133    ctx->block->instructions.emplace_back(std::move(vec));
3134    ctx->allocated_vec.emplace(dst.id(), result);
3135
3136    return dst;
3137 }
3138
3139 Temp extract_subvector(isel_context *ctx, Temp data, unsigned start, unsigned size, RegType type)
3140 {
3141    if (start == 0 && size == data.size())
3142       return type == RegType::vgpr ? as_vgpr(ctx, data) : data;
3143
3144    unsigned size_hint = 1;
3145    auto it = ctx->allocated_vec.find(data.id());
3146    if (it != ctx->allocated_vec.end())
3147       size_hint = it->second[0].size();
3148    if (size % size_hint || start % size_hint)
3149       size_hint = 1;
3150
3151    start /= size_hint;
3152    size /= size_hint;
3153
3154    Temp elems[size];
3155    for (unsigned i = 0; i < size; i++)
3156       elems[i] = emit_extract_vector(ctx, data, start + i, RegClass(type, size_hint));
3157
3158    if (size == 1)
3159       return type == RegType::vgpr ? as_vgpr(ctx, elems[0]) : elems[0];
3160
3161    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
3162    for (unsigned i = 0; i < size; i++)
3163       vec->operands[i] = Operand(elems[i]);
3164    Temp res = {ctx->program->allocateId(), RegClass(type, size * size_hint)};
3165    vec->definitions[0] = Definition(res);
3166    ctx->block->instructions.emplace_back(std::move(vec));
3167    return res;
3168 }
3169
3170 void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned data_start, unsigned total_size, unsigned offset0, unsigned offset1, unsigned align)
3171 {
3172    Builder bld(ctx->program, ctx->block);
3173    unsigned bytes_written = 0;
3174    bool large_ds_write = ctx->options->chip_class >= GFX7;
3175    bool usable_write2 = ctx->options->chip_class >= GFX7;
3176
3177    while (bytes_written < total_size * 4) {
3178       unsigned todo = total_size * 4 - bytes_written;
3179       bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
3180       bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
3181
3182       aco_opcode op = aco_opcode::last_opcode;
3183       bool write2 = false;
3184       unsigned size = 0;
3185       if (todo >= 16 && aligned16 && large_ds_write) {
3186          op = aco_opcode::ds_write_b128;
3187          size = 4;
3188       } else if (todo >= 16 && aligned8 && usable_write2) {
3189          op = aco_opcode::ds_write2_b64;
3190          write2 = true;
3191          size = 4;
3192       } else if (todo >= 12 && aligned16 && large_ds_write) {
3193          op = aco_opcode::ds_write_b96;
3194          size = 3;
3195       } else if (todo >= 8 && aligned8) {
3196          op = aco_opcode::ds_write_b64;
3197          size = 2;
3198       } else if (todo >= 8 && usable_write2) {
3199          op = aco_opcode::ds_write2_b32;
3200          write2 = true;
3201          size = 2;
3202       } else if (todo >= 4) {
3203          op = aco_opcode::ds_write_b32;
3204          size = 1;
3205       } else {
3206          assert(false);
3207       }
3208
3209       unsigned offset = offset0 + offset1 + bytes_written;
3210       unsigned max_offset = write2 ? 1020 : 65535;
3211       Temp address_offset = address;
3212       if (offset > max_offset) {
3213          address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
3214          offset = offset1 + bytes_written;
3215       }
3216       assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
3217
3218       if (write2) {
3219          Temp val0 = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size / 2, RegType::vgpr);
3220          Temp val1 = extract_subvector(ctx, data, data_start + (bytes_written >> 2) + 1, size / 2, RegType::vgpr);
3221          bld.ds(op, address_offset, val0, val1, m, offset / size / 2, (offset / size / 2) + 1);
3222       } else {
3223          Temp val = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size, RegType::vgpr);
3224          bld.ds(op, address_offset, val, m, offset);
3225       }
3226
3227       bytes_written += size * 4;
3228    }
3229 }
3230
3231 void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
3232                Temp address, unsigned base_offset, unsigned align)
3233 {
3234    assert(util_is_power_of_two_nonzero(align) && align >= 4);
3235    assert(elem_size_bytes == 4 || elem_size_bytes == 8);
3236
3237    Operand m = load_lds_size_m0(ctx);
3238
3239    /* we need at most two stores, assuming that the writemask is at most 4 bits wide */
3240    assert(wrmask <= 0x0f);
3241    int start[2], count[2];
3242    u_bit_scan_consecutive_range(&wrmask, &start[0], &count[0]);
3243    u_bit_scan_consecutive_range(&wrmask, &start[1], &count[1]);
3244    assert(wrmask == 0);
3245
3246    /* one combined store is sufficient */
3247    if (count[0] == count[1] && (align % elem_size_bytes) == 0 && (base_offset % elem_size_bytes) == 0) {
3248       Builder bld(ctx->program, ctx->block);
3249
3250       Temp address_offset = address;
3251       if ((base_offset / elem_size_bytes) + start[1] > 255) {
3252          address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
3253          base_offset = 0;
3254       }
3255
3256       assert(count[0] == 1);
3257       RegClass xtract_rc(RegType::vgpr, elem_size_bytes / 4);
3258
3259       Temp val0 = emit_extract_vector(ctx, data, start[0], xtract_rc);
3260       Temp val1 = emit_extract_vector(ctx, data, start[1], xtract_rc);
3261       aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
3262       base_offset = base_offset / elem_size_bytes;
3263       bld.ds(op, address_offset, val0, val1, m,
3264              base_offset + start[0], base_offset + start[1]);
3265       return;
3266    }
3267
3268    for (unsigned i = 0; i < 2; i++) {
3269       if (count[i] == 0)
3270          continue;
3271
3272       unsigned elem_size_words = elem_size_bytes / 4;
3273       ds_write_helper(ctx, m, address, data, start[i] * elem_size_words, count[i] * elem_size_words,
3274                       base_offset, start[i] * elem_size_bytes, align);
3275    }
3276    return;
3277 }
3278
3279 unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
3280 {
3281    unsigned align = 16;
3282    if (const_offset)
3283       align = std::min(align, 1u << (ffs(const_offset) - 1));
3284
3285    return align;
3286 }
3287
3288
3289 Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes,
3290                            unsigned split_cnt = 0u, Temp dst = Temp())
3291 {
3292    Builder bld(ctx->program, ctx->block);
3293    unsigned dword_size = elem_size_bytes / 4;
3294
3295    if (!dst.id())
3296       dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
3297
3298    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
3299    aco_ptr<Pseudo_instruction> instr {create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
3300    instr->definitions[0] = Definition(dst);
3301
3302    for (unsigned i = 0; i < cnt; ++i) {
3303       if (arr[i].id()) {
3304          assert(arr[i].size() == dword_size);
3305          allocated_vec[i] = arr[i];
3306          instr->operands[i] = Operand(arr[i]);
3307       } else {
3308          Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)), Operand(0u, dword_size == 2));
3309          allocated_vec[i] = zero;
3310          instr->operands[i] = Operand(zero);
3311       }
3312    }
3313
3314    bld.insert(std::move(instr));
3315
3316    if (split_cnt)
3317       emit_split_vector(ctx, dst, split_cnt);
3318    else
3319       ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
3320
3321    return dst;
3322 }
3323
3324 inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, unsigned const_offset)
3325 {
3326    if (const_offset >= 4096) {
3327       unsigned excess_const_offset = const_offset / 4096u * 4096u;
3328       const_offset %= 4096u;
3329
3330       if (!voffset.id())
3331          voffset = bld.copy(bld.def(v1), Operand(excess_const_offset));
3332       else if (unlikely(voffset.regClass() == s1))
3333          voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), Operand(excess_const_offset), Operand(voffset));
3334       else if (likely(voffset.regClass() == v1))
3335          voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand(excess_const_offset));
3336       else
3337          unreachable("Unsupported register class of voffset");
3338    }
3339
3340    return const_offset;
3341 }
3342
3343 void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
3344                              unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false)
3345 {
3346    assert(vdata.id());
3347    assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
3348    assert(vdata.size() >= 1 && vdata.size() <= 4);
3349
3350    Builder bld(ctx->program, ctx->block);
3351    aco_opcode op = (aco_opcode) ((unsigned) aco_opcode::buffer_store_dword + vdata.size() - 1);
3352    const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
3353
3354    Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
3355    Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
3356    Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
3357                                  /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false,
3358                                  /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc);
3359
3360    static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
3361 }
3362
3363 void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
3364                                    unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
3365                                    bool allow_combining = true, bool reorder = true, bool slc = false)
3366 {
3367    Builder bld(ctx->program, ctx->block);
3368    assert(elem_size_bytes == 4 || elem_size_bytes == 8);
3369    assert(write_mask);
3370
3371    if (elem_size_bytes == 8) {
3372       elem_size_bytes = 4;
3373       write_mask = widen_mask(write_mask, 2);
3374    }
3375
3376    while (write_mask) {
3377       int start = 0;
3378       int count = 0;
3379       u_bit_scan_consecutive_range(&write_mask, &start, &count);
3380       assert(count > 0);
3381       assert(start >= 0);
3382
3383       while (count > 0) {
3384          unsigned sub_count = allow_combining ? MIN2(count, 4) : 1;
3385          unsigned const_offset = (unsigned) start * elem_size_bytes + base_const_offset;
3386
3387          /* GFX6 doesn't have buffer_store_dwordx3, so make sure not to emit that here either. */
3388          if (unlikely(ctx->program->chip_class == GFX6 && sub_count == 3))
3389             sub_count = 2;
3390
3391          Temp elem = extract_subvector(ctx, src, start, sub_count, RegType::vgpr);
3392          emit_single_mubuf_store(ctx, descriptor, voffset, soffset, elem, const_offset, reorder, slc);
3393
3394          count -= sub_count;
3395          start += sub_count;
3396       }
3397
3398       assert(count == 0);
3399    }
3400 }
3401
3402 Temp emit_single_mubuf_load(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset,
3403                             unsigned const_offset, unsigned size_dwords, bool allow_reorder = true)
3404 {
3405    assert(size_dwords != 3 || ctx->program->chip_class != GFX6);
3406    assert(size_dwords >= 1 && size_dwords <= 4);
3407
3408    Builder bld(ctx->program, ctx->block);
3409    Temp vdata = bld.tmp(RegClass(RegType::vgpr, size_dwords));
3410    aco_opcode op = (aco_opcode) ((unsigned) aco_opcode::buffer_load_dword + size_dwords - 1);
3411    const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
3412
3413    Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
3414    Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
3415    Builder::Result r = bld.mubuf(op, Definition(vdata), Operand(descriptor), voffset_op, soffset_op, const_offset,
3416                                  /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false,
3417                                  /* disable_wqm */ false, /* glc */ true,
3418                                  /* dlc*/ ctx->program->chip_class >= GFX10, /* slc */ false);
3419
3420    static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
3421
3422    return vdata;
3423 }
3424
3425 void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
3426                      unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
3427                      unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
3428 {
3429    assert(elem_size_bytes == 4 || elem_size_bytes == 8);
3430    assert((num_components * elem_size_bytes / 4) == dst.size());
3431    assert(!!stride != allow_combining);
3432
3433    Builder bld(ctx->program, ctx->block);
3434    unsigned split_cnt = num_components;
3435
3436    if (elem_size_bytes == 8) {
3437       elem_size_bytes = 4;
3438       num_components *= 2;
3439    }
3440
3441    if (!stride)
3442       stride = elem_size_bytes;
3443
3444    unsigned load_size = 1;
3445    if (allow_combining) {
3446       if ((num_components % 4) == 0)
3447          load_size = 4;
3448       else if ((num_components % 3) == 0 && ctx->program->chip_class != GFX6)
3449          load_size = 3;
3450       else if ((num_components % 2) == 0)
3451          load_size = 2;
3452    }
3453
3454    unsigned num_loads = num_components / load_size;
3455    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
3456
3457    for (unsigned i = 0; i < num_loads; ++i) {
3458       unsigned const_offset = i * stride * load_size + base_const_offset;
3459       elems[i] = emit_single_mubuf_load(ctx, descriptor, voffset, soffset, const_offset, load_size, allow_reorder);
3460    }
3461
3462    create_vec_from_array(ctx, elems.data(), num_loads, RegType::vgpr, load_size * 4u, split_cnt, dst);
3463 }
3464
3465 std::pair<Temp, unsigned> offset_add_from_nir(isel_context *ctx, const std::pair<Temp, unsigned> &base_offset, nir_src *off_src, unsigned stride = 1u)
3466 {
3467    Builder bld(ctx->program, ctx->block);
3468    Temp offset = base_offset.first;
3469    unsigned const_offset = base_offset.second;
3470
3471    if (!nir_src_is_const(*off_src)) {
3472       Temp indirect_offset_arg = get_ssa_temp(ctx, off_src->ssa);
3473       Temp with_stride;
3474
3475       /* Calculate indirect offset with stride */
3476       if (likely(indirect_offset_arg.regClass() == v1))
3477          with_stride = bld.v_mul_imm(bld.def(v1), indirect_offset_arg, stride);
3478       else if (indirect_offset_arg.regClass() == s1)
3479          with_stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), indirect_offset_arg);
3480       else
3481          unreachable("Unsupported register class of indirect offset");
3482
3483       /* Add to the supplied base offset */
3484       if (offset.id() == 0)
3485          offset = with_stride;
3486       else if (unlikely(offset.regClass() == s1 && with_stride.regClass() == s1))
3487          offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), with_stride, offset);
3488       else if (offset.size() == 1 && with_stride.size() == 1)
3489          offset = bld.vadd32(bld.def(v1), with_stride, offset);
3490       else
3491          unreachable("Unsupported register class of indirect offset");
3492    } else {
3493       unsigned const_offset_arg = nir_src_as_uint(*off_src);
3494       const_offset += const_offset_arg * stride;
3495    }
3496
3497    return std::make_pair(offset, const_offset);
3498 }
3499
3500 std::pair<Temp, unsigned> offset_add(isel_context *ctx, const std::pair<Temp, unsigned> &off1, const std::pair<Temp, unsigned> &off2)
3501 {
3502    Builder bld(ctx->program, ctx->block);
3503    Temp offset;
3504
3505    if (off1.first.id() && off2.first.id()) {
3506       if (unlikely(off1.first.regClass() == s1 && off2.first.regClass() == s1))
3507          offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), off1.first, off2.first);
3508       else if (off1.first.size() == 1 && off2.first.size() == 1)
3509          offset = bld.vadd32(bld.def(v1), off1.first, off2.first);
3510       else
3511          unreachable("Unsupported register class of indirect offset");
3512    } else {
3513       offset = off1.first.id() ? off1.first : off2.first;
3514    }
3515
3516    return std::make_pair(offset, off1.second + off2.second);
3517 }
3518
3519 std::pair<Temp, unsigned> offset_mul(isel_context *ctx, const std::pair<Temp, unsigned> &offs, unsigned multiplier)
3520 {
3521    Builder bld(ctx->program, ctx->block);
3522    unsigned const_offset = offs.second * multiplier;
3523
3524    if (!offs.first.id())
3525       return std::make_pair(offs.first, const_offset);
3526
3527    Temp offset = unlikely(offs.first.regClass() == s1)
3528                  ? bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(multiplier), offs.first)
3529                  : bld.v_mul_imm(bld.def(v1), offs.first, multiplier);
3530
3531    return std::make_pair(offset, const_offset);
3532 }
3533
3534 std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride, unsigned component_stride)
3535 {
3536    Builder bld(ctx->program, ctx->block);
3537
3538    /* base is the driver_location, which is already multiplied by 4, so is in dwords */
3539    unsigned const_offset = nir_intrinsic_base(instr) * base_stride;
3540    /* component is in bytes */
3541    const_offset += nir_intrinsic_component(instr) * component_stride;
3542
3543    /* offset should be interpreted in relation to the base, so the instruction effectively reads/writes another input/output when it has an offset */
3544    nir_src *off_src = nir_get_io_offset_src(instr);
3545    return offset_add_from_nir(ctx, std::make_pair(Temp(), const_offset), off_src, 4u * base_stride);
3546 }
3547
3548 std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned stride = 1u)
3549 {
3550    return get_intrinsic_io_basic_offset(ctx, instr, stride, stride);
3551 }
3552
3553 Temp get_tess_rel_patch_id(isel_context *ctx)
3554 {
3555    Builder bld(ctx->program, ctx->block);
3556
3557    switch (ctx->shader->info.stage) {
3558    case MESA_SHADER_TESS_CTRL:
3559       return bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffu),
3560                       get_arg(ctx, ctx->args->ac.tcs_rel_ids));
3561    case MESA_SHADER_TESS_EVAL:
3562       return get_arg(ctx, ctx->args->tes_rel_patch_id);
3563    default:
3564       unreachable("Unsupported stage in get_tess_rel_patch_id");
3565    }
3566 }
3567
3568 std::pair<Temp, unsigned> get_tcs_per_vertex_input_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr)
3569 {
3570    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
3571    Builder bld(ctx->program, ctx->block);
3572
3573    uint32_t tcs_in_patch_stride = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 4;
3574    uint32_t tcs_in_vertex_stride = ctx->tcs_num_inputs * 4;
3575
3576    std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr);
3577
3578    nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
3579    offs = offset_add_from_nir(ctx, offs, vertex_index_src, tcs_in_vertex_stride);
3580
3581    Temp rel_patch_id = get_tess_rel_patch_id(ctx);
3582    Temp tcs_in_current_patch_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, tcs_in_patch_stride);
3583    offs = offset_add(ctx, offs, std::make_pair(tcs_in_current_patch_offset, 0));
3584
3585    return offset_mul(ctx, offs, 4u);
3586 }
3587
3588 std::pair<Temp, unsigned> get_tcs_output_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, bool per_vertex = false)
3589 {
3590    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
3591    Builder bld(ctx->program, ctx->block);
3592
3593    uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16;
3594    uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
3595    uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written);
3596    uint32_t output_vertex_size = num_tcs_outputs * 16;
3597    uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
3598    uint32_t output_patch_stride = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
3599
3600    std::pair<Temp, unsigned> offs = instr
3601                                     ? get_intrinsic_io_basic_offset(ctx, instr, 4u)
3602                                     : std::make_pair(Temp(), 0u);
3603
3604    Temp rel_patch_id = get_tess_rel_patch_id(ctx);
3605    Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, output_patch_stride);
3606
3607    if (per_vertex) {
3608       assert(instr);
3609
3610       nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
3611       offs = offset_add_from_nir(ctx, offs, vertex_index_src, output_vertex_size);
3612
3613       uint32_t output_patch0_offset = (input_patch_size * ctx->tcs_num_patches);
3614       offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_offset));
3615    } else {
3616       uint32_t output_patch0_patch_data_offset = (input_patch_size * ctx->tcs_num_patches + pervertex_output_patch_size);
3617       offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_patch_data_offset));
3618    }
3619
3620    return offs;
3621 }
3622
3623 std::pair<Temp, unsigned> get_tcs_per_vertex_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr)
3624 {
3625    Builder bld(ctx->program, ctx->block);
3626
3627    unsigned vertices_per_patch = ctx->shader->info.tess.tcs_vertices_out;
3628    unsigned attr_stride = vertices_per_patch * ctx->tcs_num_patches;
3629
3630    std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u);
3631
3632    Temp rel_patch_id = get_tess_rel_patch_id(ctx);
3633    Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, vertices_per_patch * 16u);
3634    offs = offset_add(ctx, offs, std::make_pair(patch_off, 0u));
3635
3636    nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
3637    offs = offset_add_from_nir(ctx, offs, vertex_index_src, 16u);
3638
3639    return offs;
3640 }
3641
3642 std::pair<Temp, unsigned> get_tcs_per_patch_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, unsigned const_base_offset = 0u)
3643 {
3644    Builder bld(ctx->program, ctx->block);
3645
3646    unsigned num_tcs_outputs = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL
3647                               ? util_last_bit64(ctx->args->shader_info->tcs.outputs_written)
3648                               : ctx->args->options->key.tes.tcs_num_outputs;
3649
3650    unsigned output_vertex_size = num_tcs_outputs * 16;
3651    unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
3652    unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches;
3653    unsigned attr_stride = ctx->tcs_num_patches;
3654
3655    std::pair<Temp, unsigned> offs = instr
3656                                     ? get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u)
3657                                     : std::make_pair(Temp(), 0u);
3658
3659    if (const_base_offset)
3660       offs.second += const_base_offset * attr_stride;
3661
3662    Temp rel_patch_id = get_tess_rel_patch_id(ctx);
3663    Temp patch_off = bld.v_mul_imm(bld.def(v1), rel_patch_id, 16u);
3664    offs = offset_add(ctx, offs, std::make_pair(patch_off, per_patch_data_offset));
3665
3666    return offs;
3667 }
3668
3669 bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect)
3670 {
3671    unsigned off = nir_intrinsic_base(instr) * 4u;
3672    nir_src *off_src = nir_get_io_offset_src(instr);
3673
3674    if (!nir_src_is_const(*off_src)) {
3675       *indirect = true;
3676       return false;
3677    }
3678
3679    *indirect = false;
3680    off += nir_src_as_uint(*off_src) * 16u;
3681
3682    while (mask) {
3683       unsigned slot = u_bit_scan64(&mask) + (per_vertex ? 0 : VARYING_SLOT_PATCH0);
3684       if (off == shader_io_get_unique_index((gl_varying_slot) slot) * 16u)
3685          return true;
3686    }
3687
3688    return false;
3689 }
3690
3691 bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
3692 {
3693    unsigned write_mask = nir_intrinsic_write_mask(instr);
3694    unsigned component = nir_intrinsic_component(instr);
3695    unsigned idx = nir_intrinsic_base(instr) + component;
3696
3697    nir_instr *off_instr = instr->src[1].ssa->parent_instr;
3698    if (off_instr->type != nir_instr_type_load_const)
3699       return false;
3700
3701    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
3702    idx += nir_src_as_uint(instr->src[1]) * 4u;
3703
3704    if (instr->src[0].ssa->bit_size == 64)
3705       write_mask = widen_mask(write_mask, 2);
3706
3707    for (unsigned i = 0; i < 8; ++i) {
3708       if (write_mask & (1 << i)) {
3709          ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
3710          ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, v1);
3711       }
3712       idx++;
3713    }
3714
3715    return true;
3716 }
3717
3718 bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp dst)
3719 {
3720    /* Only TCS per-vertex inputs are supported by this function.
3721     * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations is the same.
3722     */
3723    if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
3724       return false;
3725
3726    nir_src *off_src = nir_get_io_offset_src(instr);
3727    nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
3728    nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr;
3729    bool can_use_temps = nir_src_is_const(*off_src) &&
3730                         vertex_index_instr->type == nir_instr_type_intrinsic &&
3731                         nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
3732
3733    if (!can_use_temps)
3734       return false;
3735
3736    unsigned idx = nir_intrinsic_base(instr) + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
3737    Temp *src = &ctx->inputs.temps[idx];
3738    Temp vec = create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u);
3739    assert(vec.size() == dst.size());
3740
3741    Builder bld(ctx->program, ctx->block);
3742    bld.copy(Definition(dst), vec);
3743    return true;
3744 }
3745
3746 void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
3747 {
3748    Builder bld(ctx->program, ctx->block);
3749
3750    std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, 4u);
3751    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
3752    unsigned write_mask = nir_intrinsic_write_mask(instr);
3753    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u;
3754
3755    if (ctx->tcs_in_out_eq && store_output_to_temps(ctx, instr)) {
3756       /* When the TCS only reads this output directly and for the same vertices as its invocation id, it is unnecessary to store the VS output to LDS. */
3757       bool indirect_write;
3758       bool temp_only_input = tcs_driver_location_matches_api_mask(ctx, instr, true, ctx->tcs_temp_only_inputs, &indirect_write);
3759       if (temp_only_input && !indirect_write)
3760          return;
3761    }
3762
3763    if (ctx->stage == vertex_es || ctx->stage == tess_eval_es) {
3764       /* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
3765       Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
3766       Temp es2gs_offset = get_arg(ctx, ctx->args->es2gs_offset);
3767       store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, true, true);
3768    } else {
3769       Temp lds_base;
3770
3771       if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
3772          /* GFX9+: ES stage is merged into GS, data is passed between them using LDS. */
3773          unsigned itemsize = ctx->stage == vertex_geometry_gs
3774                              ? ctx->program->info->vs.es_info.esgs_itemsize
3775                              : ctx->program->info->tes.es_info.esgs_itemsize;
3776          Temp thread_id = emit_mbcnt(ctx, bld.def(v1));
3777          Temp wave_idx = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand(4u << 16 | 24));
3778          Temp vertex_idx = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), thread_id,
3779                                bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_idx), ctx->program->wave_size));
3780          lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, itemsize);
3781       } else if (ctx->stage == vertex_ls || ctx->stage == vertex_tess_control_hs) {
3782          /* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS.
3783           * GFX9+: LS is merged into HS, but still uses the same LDS layout.
3784           */
3785          unsigned num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written);
3786          Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id);
3787          lds_base = bld.v_mul_imm(bld.def(v1), vertex_idx, num_tcs_inputs * 16u);
3788       } else {
3789          unreachable("Invalid LS or ES stage");
3790       }
3791
3792       offs = offset_add(ctx, offs, std::make_pair(lds_base, 0u));
3793       unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
3794       store_lds(ctx, elem_size_bytes, src, write_mask, offs.first, offs.second, lds_align);
3795    }
3796 }
3797
3798 bool should_write_tcs_patch_output_to_vmem(isel_context *ctx, nir_intrinsic_instr *instr)
3799 {
3800    unsigned off = nir_intrinsic_base(instr) * 4u;
3801    return off != ctx->tcs_tess_lvl_out_loc &&
3802           off != ctx->tcs_tess_lvl_in_loc;
3803 }
3804
3805 bool should_write_tcs_output_to_lds(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
3806 {
3807    /* When none of the appropriate outputs are read, we are OK to never write to LDS */
3808    if (per_vertex ? ctx->shader->info.outputs_read == 0U : ctx->shader->info.patch_outputs_read == 0u)
3809       return false;
3810
3811    uint64_t mask = per_vertex
3812                    ? ctx->shader->info.outputs_read
3813                    : ctx->shader->info.patch_outputs_read;
3814    bool indirect_write;
3815    bool output_read = tcs_driver_location_matches_api_mask(ctx, instr, per_vertex, mask, &indirect_write);
3816    return indirect_write || output_read;
3817 }
3818
3819 void visit_store_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
3820 {
3821    assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
3822    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
3823
3824    Builder bld(ctx->program, ctx->block);
3825
3826    Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
3827    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
3828    unsigned write_mask = nir_intrinsic_write_mask(instr);
3829
3830    /* Only write to VMEM if the output is per-vertex or it's per-patch non tess factor */
3831    bool write_to_vmem = per_vertex || should_write_tcs_patch_output_to_vmem(ctx, instr);
3832    /* Only write to LDS if the output is read by the shader, or it's per-patch tess factor */
3833    bool write_to_lds = !write_to_vmem || should_write_tcs_output_to_lds(ctx, instr, per_vertex);
3834
3835    if (write_to_vmem) {
3836       std::pair<Temp, unsigned> vmem_offs = per_vertex
3837                                             ? get_tcs_per_vertex_output_vmem_offset(ctx, instr)
3838                                             : get_tcs_per_patch_output_vmem_offset(ctx, instr);
3839
3840       Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
3841       Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
3842       store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, false);
3843    }
3844
3845    if (write_to_lds) {
3846       std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
3847       unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
3848       store_lds(ctx, elem_size_bytes, store_val, write_mask, lds_offs.first, lds_offs.second, lds_align);
3849    }
3850 }
3851
3852 void visit_load_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
3853 {
3854    assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
3855    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
3856
3857    Builder bld(ctx->program, ctx->block);
3858
3859    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3860    std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
3861    unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
3862    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
3863
3864    load_lds(ctx, elem_size_bytes, dst, lds_offs.first, lds_offs.second, lds_align);
3865 }
3866
3867 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
3868 {
3869    if (ctx->stage == vertex_vs ||
3870        ctx->stage == tess_eval_vs ||
3871        ctx->stage == fragment_fs ||
3872        ctx->stage == ngg_vertex_gs ||
3873        ctx->stage == ngg_tess_eval_gs ||
3874        ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
3875       bool stored_to_temps = store_output_to_temps(ctx, instr);
3876       if (!stored_to_temps) {
3877          fprintf(stderr, "Unimplemented output offset instruction:\n");
3878          nir_print_instr(instr->src[1].ssa->parent_instr, stderr);
3879          fprintf(stderr, "\n");
3880          abort();
3881       }
3882    } else if (ctx->stage == vertex_es ||
3883               ctx->stage == vertex_ls ||
3884               ctx->stage == tess_eval_es ||
3885               (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
3886               (ctx->stage == vertex_geometry_gs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
3887               (ctx->stage == tess_eval_geometry_gs && ctx->shader->info.stage == MESA_SHADER_TESS_EVAL)) {
3888       visit_store_ls_or_es_output(ctx, instr);
3889    } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
3890       visit_store_tcs_output(ctx, instr, false);
3891    } else {
3892       unreachable("Shader stage not implemented");
3893    }
3894 }
3895
3896 void visit_load_output(isel_context *ctx, nir_intrinsic_instr *instr)
3897 {
3898    visit_load_tcs_output(ctx, instr, false);
3899 }
3900
3901 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
3902 {
3903    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
3904    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
3905
3906    Builder bld(ctx->program, ctx->block);
3907    Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
3908    if (ctx->program->has_16bank_lds)
3909       interp_p1.instr->operands[0].setLateKill(true);
3910    bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx, component);
3911 }
3912
3913 void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
3914 {
3915    aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
3916    for (unsigned i = 0; i < num_components; i++)
3917       vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
3918    if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
3919       assert(num_components == 4);
3920       Builder bld(ctx->program, ctx->block);
3921       vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
3922    }
3923
3924    for (Operand& op : vec->operands)
3925       op = op.isUndefined() ? Operand(0u) : op;
3926
3927    vec->definitions[0] = Definition(dst);
3928    ctx->block->instructions.emplace_back(std::move(vec));
3929    emit_split_vector(ctx, dst, num_components);
3930    return;
3931 }
3932
3933 void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
3934 {
3935    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3936    Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
3937    unsigned idx = nir_intrinsic_base(instr);
3938    unsigned component = nir_intrinsic_component(instr);
3939    Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
3940
3941    nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
3942    if (offset) {
3943       assert(offset->u32 == 0);
3944    } else {
3945       /* the lower 15bit of the prim_mask contain the offset into LDS
3946        * while the upper bits contain the number of prims */
3947       Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
3948       assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
3949       Builder bld(ctx->program, ctx->block);
3950       Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
3951       stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
3952       stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
3953       offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
3954       prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
3955    }
3956
3957    if (instr->dest.ssa.num_components == 1) {
3958       emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
3959    } else {
3960       aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
3961       for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
3962       {
3963          Temp tmp = {ctx->program->allocateId(), v1};
3964          emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
3965          vec->operands[i] = Operand(tmp);
3966       }
3967       vec->definitions[0] = Definition(dst);
3968       ctx->block->instructions.emplace_back(std::move(vec));
3969    }
3970 }
3971
3972 bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info,
3973                              unsigned offset, unsigned stride, unsigned channels)
3974 {
3975    unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
3976    if (vtx_info->chan_byte_size != 4 && channels == 3)
3977       return false;
3978    return (ctx->options->chip_class != GFX6 && ctx->options->chip_class != GFX10) ||
3979           (offset % vertex_byte_size == 0 && stride % vertex_byte_size == 0);
3980 }
3981
3982 uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info,
3983                               unsigned offset, unsigned stride, unsigned *channels)
3984 {
3985    if (!vtx_info->chan_byte_size) {
3986       *channels = vtx_info->num_channels;
3987       return vtx_info->chan_format;
3988    }
3989
3990    unsigned num_channels = *channels;
3991    if (!check_vertex_fetch_size(ctx, vtx_info, offset, stride, *channels)) {
3992       unsigned new_channels = num_channels + 1;
3993       /* first, assume more loads is worse and try using a larger data format */
3994       while (new_channels <= 4 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) {
3995          new_channels++;
3996          /* don't make the attribute potentially out-of-bounds */
3997          if (offset + new_channels * vtx_info->chan_byte_size > stride)
3998             new_channels = 5;
3999       }
4000
4001       if (new_channels == 5) {
4002          /* then try decreasing load size (at the cost of more loads) */
4003          new_channels = *channels;
4004          while (new_channels > 1 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels))
4005             new_channels--;
4006       }
4007
4008       if (new_channels < *channels)
4009          *channels = new_channels;
4010       num_channels = new_channels;
4011    }
4012
4013    switch (vtx_info->chan_format) {
4014    case V_008F0C_BUF_DATA_FORMAT_8:
4015       return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
4016                          V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
4017    case V_008F0C_BUF_DATA_FORMAT_16:
4018       return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
4019                          V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
4020    case V_008F0C_BUF_DATA_FORMAT_32:
4021       return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
4022                          V_008F0C_BUF_DATA_FORMAT_32_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
4023    }
4024    unreachable("shouldn't reach here");
4025    return V_008F0C_BUF_DATA_FORMAT_INVALID;
4026 }
4027
4028 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
4029  * so we may need to fix it up. */
4030 Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
4031 {
4032    Builder bld(ctx->program, ctx->block);
4033
4034    if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
4035       alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
4036
4037    /* For the integer-like cases, do a natural sign extension.
4038     *
4039     * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
4040     * and happen to contain 0, 1, 2, 3 as the two LSBs of the
4041     * exponent.
4042     */
4043    alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
4044    alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
4045
4046    /* Convert back to the right type. */
4047    if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
4048       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4049       Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0xbf800000u), alpha);
4050       alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
4051    } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
4052       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4053    }
4054
4055    return alpha;
4056 }
4057
4058 void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
4059 {
4060    Builder bld(ctx->program, ctx->block);
4061    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4062    if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
4063
4064       nir_instr *off_instr = instr->src[0].ssa->parent_instr;
4065       if (off_instr->type != nir_instr_type_load_const) {
4066          fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
4067          nir_print_instr(off_instr, stderr);
4068          fprintf(stderr, "\n");
4069       }
4070       uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
4071
4072       Temp vertex_buffers = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->vertex_buffers));
4073
4074       unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
4075       unsigned component = nir_intrinsic_component(instr);
4076       unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
4077       uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
4078       uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
4079       unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
4080
4081       unsigned dfmt = attrib_format & 0xf;
4082       unsigned nfmt = (attrib_format >> 4) & 0x7;
4083       const struct ac_data_format_info *vtx_info = ac_get_data_format_info(dfmt);
4084
4085       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
4086       unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
4087       unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
4088       bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
4089       if (post_shuffle)
4090          num_channels = MAX2(num_channels, 3);
4091
4092       Operand off = bld.copy(bld.def(s1), Operand(attrib_binding * 16u));
4093       Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);
4094
4095       Temp index;
4096       if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
4097          uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
4098          Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
4099          if (divisor) {
4100             Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
4101             if (divisor != 1) {
4102                Temp divided = bld.tmp(v1);
4103                emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
4104                index = bld.vadd32(bld.def(v1), start_instance, divided);
4105             } else {
4106                index = bld.vadd32(bld.def(v1), start_instance, instance_id);
4107             }
4108          } else {
4109             index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), start_instance);
4110          }
4111       } else {
4112          index = bld.vadd32(bld.def(v1),
4113                             get_arg(ctx, ctx->args->ac.base_vertex),
4114                             get_arg(ctx, ctx->args->ac.vertex_id));
4115       }
4116
4117       Temp channels[num_channels];
4118       unsigned channel_start = 0;
4119       bool direct_fetch = false;
4120
4121       /* skip unused channels at the start */
4122       if (vtx_info->chan_byte_size && !post_shuffle) {
4123          channel_start = ffs(mask) - 1;
4124          for (unsigned i = 0; i < channel_start; i++)
4125             channels[i] = Temp(0, s1);
4126       } else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) {
4127          num_channels = 3 - (ffs(mask) - 1);
4128       }
4129
4130       /* load channels */
4131       while (channel_start < num_channels) {
4132          unsigned fetch_size = num_channels - channel_start;
4133          unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
4134          bool expanded = false;
4135
4136          /* use MUBUF when possible to avoid possible alignment issues */
4137          /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */
4138          bool use_mubuf = (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT ||
4139                            nfmt == V_008F0C_BUF_NUM_FORMAT_UINT ||
4140                            nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) &&
4141                           vtx_info->chan_byte_size == 4;
4142          unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
4143          if (!use_mubuf) {
4144             fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_size);
4145          } else {
4146             if (fetch_size == 3 && ctx->options->chip_class == GFX6) {
4147                /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
4148                fetch_size = 4;
4149                expanded = true;
4150             }
4151          }
4152
4153          Temp fetch_index = index;
4154          if (attrib_stride != 0 && fetch_offset > attrib_stride) {
4155             fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index);
4156             fetch_offset = fetch_offset % attrib_stride;
4157          }
4158
4159          Operand soffset(0u);
4160          if (fetch_offset >= 4096) {
4161             soffset = bld.copy(bld.def(s1), Operand(fetch_offset / 4096 * 4096));
4162             fetch_offset %= 4096;
4163          }
4164
4165          aco_opcode opcode;
4166          switch (fetch_size) {
4167          case 1:
4168             opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
4169             break;
4170          case 2:
4171             opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
4172             break;
4173          case 3:
4174             assert(ctx->options->chip_class >= GFX7 ||
4175                    (!use_mubuf && ctx->options->chip_class == GFX6));
4176             opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
4177             break;
4178          case 4:
4179             opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
4180             break;
4181          default:
4182             unreachable("Unimplemented load_input vector size");
4183          }
4184
4185          Temp fetch_dst;
4186          if (channel_start == 0 && fetch_size == dst.size() && !post_shuffle &&
4187              !expanded && (alpha_adjust == RADV_ALPHA_ADJUST_NONE ||
4188                            num_channels <= 3)) {
4189             direct_fetch = true;
4190             fetch_dst = dst;
4191          } else {
4192             fetch_dst = bld.tmp(RegType::vgpr, fetch_size);
4193          }
4194
4195          if (use_mubuf) {
4196             Instruction *mubuf = bld.mubuf(opcode,
4197                                            Definition(fetch_dst), list, fetch_index, soffset,
4198                                            fetch_offset, false, true).instr;
4199             static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
4200          } else {
4201             Instruction *mtbuf = bld.mtbuf(opcode,
4202                                            Definition(fetch_dst), list, fetch_index, soffset,
4203                                            fetch_dfmt, nfmt, fetch_offset, false, true).instr;
4204             static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
4205          }
4206
4207          emit_split_vector(ctx, fetch_dst, fetch_dst.size());
4208
4209          if (fetch_size == 1) {
4210             channels[channel_start] = fetch_dst;
4211          } else {
4212             for (unsigned i = 0; i < MIN2(fetch_size, num_channels - channel_start); i++)
4213                channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, v1);
4214          }
4215
4216          channel_start += fetch_size;
4217       }
4218
4219       if (!direct_fetch) {
4220          bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
4221                          nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
4222
4223          static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
4224          static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
4225          const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
4226
4227          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
4228          std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
4229          unsigned num_temp = 0;
4230          for (unsigned i = 0; i < dst.size(); i++) {
4231             unsigned idx = i + component;
4232             if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) {
4233                Temp channel = channels[swizzle[idx]];
4234                if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE)
4235                   channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);
4236                vec->operands[i] = Operand(channel);
4237
4238                num_temp++;
4239                elems[i] = channel;
4240             } else if (is_float && idx == 3) {
4241                vec->operands[i] = Operand(0x3f800000u);
4242             } else if (!is_float && idx == 3) {
4243                vec->operands[i] = Operand(1u);
4244             } else {
4245                vec->operands[i] = Operand(0u);
4246             }
4247          }
4248          vec->definitions[0] = Definition(dst);
4249          ctx->block->instructions.emplace_back(std::move(vec));
4250          emit_split_vector(ctx, dst, dst.size());
4251
4252          if (num_temp == dst.size())
4253             ctx->allocated_vec.emplace(dst.id(), elems);
4254       }
4255    } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
4256       unsigned offset_idx = instr->intrinsic == nir_intrinsic_load_input ? 0 : 1;
4257       nir_instr *off_instr = instr->src[offset_idx].ssa->parent_instr;
4258       if (off_instr->type != nir_instr_type_load_const ||
4259           nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
4260          fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
4261          nir_print_instr(off_instr, stderr);
4262          fprintf(stderr, "\n");
4263       }
4264
4265       Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
4266       nir_const_value* offset = nir_src_as_const_value(instr->src[offset_idx]);
4267       if (offset) {
4268          assert(offset->u32 == 0);
4269       } else {
4270          /* the lower 15bit of the prim_mask contain the offset into LDS
4271           * while the upper bits contain the number of prims */
4272          Temp offset_src = get_ssa_temp(ctx, instr->src[offset_idx].ssa);
4273          assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
4274          Builder bld(ctx->program, ctx->block);
4275          Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
4276          stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
4277          stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
4278          offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
4279          prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
4280       }
4281
4282       unsigned idx = nir_intrinsic_base(instr);
4283       unsigned component = nir_intrinsic_component(instr);
4284       unsigned vertex_id = 2; /* P0 */
4285
4286       if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
4287          nir_const_value* src0 = nir_src_as_const_value(instr->src[0]);
4288          switch (src0->u32) {
4289          case 0:
4290             vertex_id = 2; /* P0 */
4291             break;
4292          case 1:
4293             vertex_id = 0; /* P10 */
4294             break;
4295          case 2:
4296             vertex_id = 1; /* P20 */
4297             break;
4298          default:
4299             unreachable("invalid vertex index");
4300          }
4301       }
4302
4303       if (dst.size() == 1) {
4304          bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(vertex_id), bld.m0(prim_mask), idx, component);
4305       } else {
4306          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
4307          for (unsigned i = 0; i < dst.size(); i++)
4308             vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(vertex_id), bld.m0(prim_mask), idx, component + i);
4309          vec->definitions[0] = Definition(dst);
4310          bld.insert(std::move(vec));
4311       }
4312
4313    } else if (ctx->shader->info.stage == MESA_SHADER_TESS_EVAL) {
4314       Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
4315       Temp soffset = get_arg(ctx, ctx->args->oc_lds);
4316       std::pair<Temp, unsigned> offs = get_tcs_per_patch_output_vmem_offset(ctx, instr);
4317       unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8u;
4318
4319       load_vmem_mubuf(ctx, dst, ring, offs.first, soffset, offs.second, elem_size_bytes, instr->dest.ssa.num_components);
4320    } else {
4321       unreachable("Shader stage not implemented");
4322    }
4323 }
4324
4325 std::pair<Temp, unsigned> get_gs_per_vertex_input_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride = 1u)
4326 {
4327    assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
4328
4329    Builder bld(ctx->program, ctx->block);
4330    nir_src *vertex_src = nir_get_io_vertex_index_src(instr);
4331    Temp vertex_offset;
4332
4333    if (!nir_src_is_const(*vertex_src)) {
4334       /* better code could be created, but this case probably doesn't happen
4335        * much in practice */
4336       Temp indirect_vertex = as_vgpr(ctx, get_ssa_temp(ctx, vertex_src->ssa));
4337       for (unsigned i = 0; i < ctx->shader->info.gs.vertices_in; i++) {
4338          Temp elem;
4339
4340          if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
4341             elem = get_arg(ctx, ctx->args->gs_vtx_offset[i / 2u * 2u]);
4342             if (i % 2u)
4343                elem = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), elem);
4344          } else {
4345             elem = get_arg(ctx, ctx->args->gs_vtx_offset[i]);
4346          }
4347
4348          if (vertex_offset.id()) {
4349             Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)),
4350                                  Operand(i), indirect_vertex);
4351             vertex_offset = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), vertex_offset, elem, cond);
4352          } else {
4353             vertex_offset = elem;
4354          }
4355       }
4356
4357       if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs)
4358          vertex_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu), vertex_offset);
4359    } else {
4360       unsigned vertex = nir_src_as_uint(*vertex_src);
4361       if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs)
4362          vertex_offset = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
4363                                   get_arg(ctx, ctx->args->gs_vtx_offset[vertex / 2u * 2u]),
4364                                   Operand((vertex % 2u) * 16u), Operand(16u));
4365       else
4366          vertex_offset = get_arg(ctx, ctx->args->gs_vtx_offset[vertex]);
4367    }
4368
4369    std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, base_stride);
4370    offs = offset_add(ctx, offs, std::make_pair(vertex_offset, 0u));
4371    return offset_mul(ctx, offs, 4u);
4372 }
4373
4374 void visit_load_gs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
4375 {
4376    assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
4377
4378    Builder bld(ctx->program, ctx->block);
4379    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4380    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
4381
4382    if (ctx->stage == geometry_gs) {
4383       std::pair<Temp, unsigned> offs = get_gs_per_vertex_input_offset(ctx, instr, ctx->program->wave_size);
4384       Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_GS * 16u));
4385       load_vmem_mubuf(ctx, dst, ring, offs.first, Temp(), offs.second, elem_size_bytes, instr->dest.ssa.num_components, 4u * ctx->program->wave_size, false, true);
4386    } else if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
4387       std::pair<Temp, unsigned> offs = get_gs_per_vertex_input_offset(ctx, instr);
4388       unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
4389       load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align);
4390    } else {
4391       unreachable("Unsupported GS stage.");
4392    }
4393 }
4394
4395 void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
4396 {
4397    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4398
4399    Builder bld(ctx->program, ctx->block);
4400    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4401
4402    if (load_input_from_temps(ctx, instr, dst))
4403       return;
4404
4405    std::pair<Temp, unsigned> offs = get_tcs_per_vertex_input_lds_offset(ctx, instr);
4406    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
4407    unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
4408
4409    load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align);
4410 }
4411
4412 void visit_load_tes_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
4413 {
4414    assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
4415
4416    Builder bld(ctx->program, ctx->block);
4417
4418    Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
4419    Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
4420    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4421
4422    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
4423    std::pair<Temp, unsigned> offs = get_tcs_per_vertex_output_vmem_offset(ctx, instr);
4424
4425    load_vmem_mubuf(ctx, dst, ring, offs.first, oc_lds, offs.second, elem_size_bytes, instr->dest.ssa.num_components, 0u, true, true);
4426 }
4427
4428 void visit_load_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
4429 {
4430    switch (ctx->shader->info.stage) {
4431    case MESA_SHADER_GEOMETRY:
4432       visit_load_gs_per_vertex_input(ctx, instr);
4433       break;
4434    case MESA_SHADER_TESS_CTRL:
4435       visit_load_tcs_per_vertex_input(ctx, instr);
4436       break;
4437    case MESA_SHADER_TESS_EVAL:
4438       visit_load_tes_per_vertex_input(ctx, instr);
4439       break;
4440    default:
4441       unreachable("Unimplemented shader stage");
4442    }
4443 }
4444
4445 void visit_load_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr)
4446 {
4447    visit_load_tcs_output(ctx, instr, true);
4448 }
4449
4450 void visit_store_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr)
4451 {
4452    assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
4453    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4454
4455    visit_store_tcs_output(ctx, instr, true);
4456 }
4457
4458 void visit_load_tess_coord(isel_context *ctx, nir_intrinsic_instr *instr)
4459 {
4460    assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
4461
4462    Builder bld(ctx->program, ctx->block);
4463    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4464
4465    Operand tes_u(get_arg(ctx, ctx->args->tes_u));
4466    Operand tes_v(get_arg(ctx, ctx->args->tes_v));
4467    Operand tes_w(0u);
4468
4469    if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) {
4470       Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
4471       tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0x3f800000u /* 1.0f */), tmp);
4472       tes_w = Operand(tmp);
4473    }
4474
4475    Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
4476    emit_split_vector(ctx, tess_coord, 3);
4477 }
4478
4479 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
4480 {
4481    if (ctx->program->info->need_indirect_descriptor_sets) {
4482       Builder bld(ctx->program, ctx->block);
4483       Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
4484       Operand off = bld.copy(bld.def(s1), Operand(desc_set << 2));
4485       return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off);//, false, false, false);
4486    }
4487
4488    return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
4489 }
4490
4491
4492 void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
4493 {
4494    Builder bld(ctx->program, ctx->block);
4495    Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
4496    if (!ctx->divergent_vals[instr->dest.ssa.index])
4497       index = bld.as_uniform(index);
4498    unsigned desc_set = nir_intrinsic_desc_set(instr);
4499    unsigned binding = nir_intrinsic_binding(instr);
4500
4501    Temp desc_ptr;
4502    radv_pipeline_layout *pipeline_layout = ctx->options->layout;
4503    radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
4504    unsigned offset = layout->binding[binding].offset;
4505    unsigned stride;
4506    if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
4507        layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
4508       unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
4509       desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
4510       offset = pipeline_layout->push_constant_size + 16 * idx;
4511       stride = 16;
4512    } else {
4513       desc_ptr = load_desc_ptr(ctx, desc_set);
4514       stride = layout->binding[binding].size;
4515    }
4516
4517    nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
4518    unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
4519    if (stride != 1) {
4520       if (nir_const_index) {
4521          const_index = const_index * stride;
4522       } else if (index.type() == RegType::vgpr) {
4523          bool index24bit = layout->binding[binding].array_size <= 0x1000000;
4524          index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
4525       } else {
4526          index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
4527       }
4528    }
4529    if (offset) {
4530       if (nir_const_index) {
4531          const_index = const_index + offset;
4532       } else if (index.type() == RegType::vgpr) {
4533          index = bld.vadd32(bld.def(v1), Operand(offset), index);
4534       } else {
4535          index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
4536       }
4537    }
4538
4539    if (nir_const_index && const_index == 0) {
4540       index = desc_ptr;
4541    } else if (index.type() == RegType::vgpr) {
4542       index = bld.vadd32(bld.def(v1),
4543                          nir_const_index ? Operand(const_index) : Operand(index),
4544                          Operand(desc_ptr));
4545    } else {
4546       index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
4547                        nir_const_index ? Operand(const_index) : Operand(index),
4548                        Operand(desc_ptr));
4549    }
4550
4551    bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
4552 }
4553
4554 void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
4555                  Temp dst, Temp rsrc, Temp offset, int byte_align,
4556                  bool glc=false, bool readonly=true)
4557 {
4558    Builder bld(ctx->program, ctx->block);
4559    bool dlc = glc && ctx->options->chip_class >= GFX10;
4560    unsigned num_bytes = num_components * component_size;
4561
4562    aco_opcode op;
4563    if (dst.type() == RegType::vgpr || ((ctx->options->chip_class < GFX8 || component_size < 4) && !readonly)) {
4564       Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4565       Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4566       unsigned const_offset = 0;
4567
4568       /* for small bit sizes add buffer for unaligned loads */
4569       if (byte_align) {
4570          if (num_bytes > 2)
4571             num_bytes += byte_align == -1 ? 4 - component_size : byte_align;
4572          else
4573             byte_align = 0;
4574       }
4575
4576       Temp lower = Temp();
4577       if (num_bytes > 16) {
4578          assert(num_components == 3 || num_components == 4);
4579          op = aco_opcode::buffer_load_dwordx4;
4580          lower = bld.tmp(v4);
4581          aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4582          mubuf->definitions[0] = Definition(lower);
4583          mubuf->operands[0] = Operand(rsrc);
4584          mubuf->operands[1] = vaddr;
4585          mubuf->operands[2] = soffset;
4586          mubuf->offen = (offset.type() == RegType::vgpr);
4587          mubuf->glc = glc;
4588          mubuf->dlc = dlc;
4589          mubuf->barrier = readonly ? barrier_none : barrier_buffer;
4590          mubuf->can_reorder = readonly;
4591          bld.insert(std::move(mubuf));
4592          emit_split_vector(ctx, lower, 2);
4593          num_bytes -= 16;
4594          const_offset = 16;
4595       } else if (num_bytes == 12 && ctx->options->chip_class == GFX6) {
4596          /* GFX6 doesn't support loading vec3, expand to vec4. */
4597          num_bytes = 16;
4598       }
4599
4600       switch (num_bytes) {
4601          case 1:
4602             op = aco_opcode::buffer_load_ubyte;
4603             break;
4604          case 2:
4605             op = aco_opcode::buffer_load_ushort;
4606             break;
4607          case 3:
4608          case 4:
4609             op = aco_opcode::buffer_load_dword;
4610             break;
4611          case 5:
4612          case 6:
4613          case 7:
4614          case 8:
4615             op = aco_opcode::buffer_load_dwordx2;
4616             break;
4617          case 10:
4618          case 12:
4619             assert(ctx->options->chip_class > GFX6);
4620             op = aco_opcode::buffer_load_dwordx3;
4621             break;
4622          case 16:
4623             op = aco_opcode::buffer_load_dwordx4;
4624             break;
4625          default:
4626             unreachable("Load SSBO not implemented for this size.");
4627       }
4628       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
4629       mubuf->operands[0] = Operand(rsrc);
4630       mubuf->operands[1] = vaddr;
4631       mubuf->operands[2] = soffset;
4632       mubuf->offen = (offset.type() == RegType::vgpr);
4633       mubuf->glc = glc;
4634       mubuf->dlc = dlc;
4635       mubuf->barrier = readonly ? barrier_none : barrier_buffer;
4636       mubuf->can_reorder = readonly;
4637       mubuf->offset = const_offset;
4638       aco_ptr<Instruction> instr = std::move(mubuf);
4639
4640       if (component_size < 4) {
4641          Temp vec = num_bytes <= 4 ? bld.tmp(v1) : num_bytes <= 8 ? bld.tmp(v2) : bld.tmp(v3);
4642          instr->definitions[0] = Definition(vec);
4643          bld.insert(std::move(instr));
4644
4645          if (byte_align == -1 || (byte_align && dst.type() == RegType::sgpr)) {
4646             Operand align = byte_align == -1 ? Operand(offset) : Operand((uint32_t)byte_align);
4647             Temp tmp[3] = {vec, vec, vec};
4648
4649             if (vec.size() == 3) {
4650                tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
4651                bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
4652             } else if (vec.size() == 2) {
4653                tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
4654                bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
4655             }
4656             for (unsigned i = 0; i < dst.size(); i++)
4657                tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], align);
4658
4659             vec = tmp[0];
4660             if (dst.size() == 2)
4661                vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
4662
4663             byte_align = 0;
4664          }
4665
4666          if (dst.type() == RegType::vgpr && num_components == 1) {
4667             bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), vec, Operand(byte_align / component_size));
4668          } else {
4669             trim_subdword_vector(ctx, vec, dst, 4 * vec.size() / component_size, ((1 << num_components) - 1) << byte_align / component_size);
4670          }
4671
4672          return;
4673
4674       } else if (dst.size() > 4) {
4675          assert(lower != Temp());
4676          Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size());
4677          instr->definitions[0] = Definition(upper);
4678          bld.insert(std::move(instr));
4679          if (dst.size() == 8)
4680             emit_split_vector(ctx, upper, 2);
4681          instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1));
4682          instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2));
4683          instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2));
4684          instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2));
4685          if (dst.size() == 8)
4686             instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2));
4687       } else if (dst.size() == 3 && ctx->options->chip_class == GFX6) {
4688          Temp vec = bld.tmp(v4);
4689          instr->definitions[0] = Definition(vec);
4690          bld.insert(std::move(instr));
4691          emit_split_vector(ctx, vec, 4);
4692
4693          instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, 3, 1));
4694          instr->operands[0] = Operand(emit_extract_vector(ctx, vec, 0, v1));
4695          instr->operands[1] = Operand(emit_extract_vector(ctx, vec, 1, v1));
4696          instr->operands[2] = Operand(emit_extract_vector(ctx, vec, 2, v1));
4697       }
4698
4699       if (dst.type() == RegType::sgpr) {
4700          Temp vec = bld.tmp(RegType::vgpr, dst.size());
4701          instr->definitions[0] = Definition(vec);
4702          bld.insert(std::move(instr));
4703          expand_vector(ctx, vec, dst, num_components, (1 << num_components) - 1);
4704       } else {
4705          instr->definitions[0] = Definition(dst);
4706          bld.insert(std::move(instr));
4707          emit_split_vector(ctx, dst, num_components);
4708       }
4709    } else {
4710       /* for small bit sizes add buffer for unaligned loads */
4711       if (byte_align)
4712          num_bytes += byte_align == -1 ? 4 - component_size : byte_align;
4713
4714       switch (num_bytes) {
4715          case 1:
4716          case 2:
4717          case 3:
4718          case 4:
4719             op = aco_opcode::s_buffer_load_dword;
4720             break;
4721          case 5:
4722          case 6:
4723          case 7:
4724          case 8:
4725             op = aco_opcode::s_buffer_load_dwordx2;
4726             break;
4727          case 10:
4728          case 12:
4729          case 16:
4730             op = aco_opcode::s_buffer_load_dwordx4;
4731             break;
4732          case 24:
4733          case 32:
4734             op = aco_opcode::s_buffer_load_dwordx8;
4735             break;
4736          default:
4737             unreachable("Load SSBO not implemented for this size.");
4738       }
4739       offset = bld.as_uniform(offset);
4740       aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4741       load->operands[0] = Operand(rsrc);
4742       load->operands[1] = Operand(offset);
4743       assert(load->operands[1].getTemp().type() == RegType::sgpr);
4744       load->definitions[0] = Definition(dst);
4745       load->glc = glc;
4746       load->dlc = dlc;
4747       load->barrier = readonly ? barrier_none : barrier_buffer;
4748       load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
4749       assert(ctx->options->chip_class >= GFX8 || !glc);
4750
4751       /* adjust misaligned small bit size loads */
4752       if (byte_align) {
4753          Temp vec = num_bytes <= 4 ? bld.tmp(s1) : num_bytes <= 8 ? bld.tmp(s2) : bld.tmp(s4);
4754          load->definitions[0] = Definition(vec);
4755          bld.insert(std::move(load));
4756          Operand byte_offset = byte_align > 0 ? Operand(uint32_t(byte_align)) : Operand(offset);
4757          byte_align_scalar(ctx, vec, byte_offset, dst);
4758
4759       /* trim vector */
4760       } else if (dst.size() == 3) {
4761          Temp vec = bld.tmp(s4);
4762          load->definitions[0] = Definition(vec);
4763          bld.insert(std::move(load));
4764          emit_split_vector(ctx, vec, 4);
4765
4766          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4767                     emit_extract_vector(ctx, vec, 0, s1),
4768                     emit_extract_vector(ctx, vec, 1, s1),
4769                     emit_extract_vector(ctx, vec, 2, s1));
4770       } else if (dst.size() == 6) {
4771          Temp vec = bld.tmp(s8);
4772          load->definitions[0] = Definition(vec);
4773          bld.insert(std::move(load));
4774          emit_split_vector(ctx, vec, 4);
4775
4776          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4777                     emit_extract_vector(ctx, vec, 0, s2),
4778                     emit_extract_vector(ctx, vec, 1, s2),
4779                     emit_extract_vector(ctx, vec, 2, s2));
4780       } else {
4781          bld.insert(std::move(load));
4782       }
4783       emit_split_vector(ctx, dst, num_components);
4784    }
4785 }
4786
4787 void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
4788 {
4789    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4790    Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
4791
4792    Builder bld(ctx->program, ctx->block);
4793
4794    nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
4795    unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
4796    unsigned binding = nir_intrinsic_binding(idx_instr);
4797    radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
4798
4799    if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
4800       uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
4801                            S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
4802                            S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
4803                            S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
4804       if (ctx->options->chip_class >= GFX10) {
4805          desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
4806                       S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
4807                       S_008F0C_RESOURCE_LEVEL(1);
4808       } else {
4809          desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4810                       S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4811       }
4812       Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
4813                                      Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
4814                                      Operand(0xFFFFFFFFu),
4815                                      Operand(desc_type));
4816       rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
4817                         rsrc, upper_dwords);
4818    } else {
4819       rsrc = convert_pointer_to_64_bit(ctx, rsrc);
4820       rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4821    }
4822    unsigned size = instr->dest.ssa.bit_size / 8;
4823    int byte_align = 0;
4824    if (size < 4) {
4825       unsigned align_mul = nir_intrinsic_align_mul(instr);
4826       unsigned align_offset = nir_intrinsic_align_offset(instr);
4827       byte_align = align_mul % 4 == 0 ? align_offset : -1;
4828    }
4829    load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), byte_align);
4830 }
4831
4832 void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
4833 {
4834    Builder bld(ctx->program, ctx->block);
4835    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4836    unsigned offset = nir_intrinsic_base(instr);
4837    unsigned count = instr->dest.ssa.num_components;
4838    nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
4839
4840    if (index_cv && instr->dest.ssa.bit_size == 32) {
4841       unsigned start = (offset + index_cv->u32) / 4u;
4842       start -= ctx->args->ac.base_inline_push_consts;
4843       if (start + count <= ctx->args->ac.num_inline_push_consts) {
4844          std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
4845          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4846          for (unsigned i = 0; i < count; ++i) {
4847             elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
4848             vec->operands[i] = Operand{elems[i]};
4849          }
4850          vec->definitions[0] = Definition(dst);
4851          ctx->block->instructions.emplace_back(std::move(vec));
4852          ctx->allocated_vec.emplace(dst.id(), elems);
4853          return;
4854       }
4855    }
4856
4857    Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
4858    if (offset != 0) // TODO check if index != 0 as well
4859       index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
4860    Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
4861    Temp vec = dst;
4862    bool trim = false;
4863    bool aligned = true;
4864
4865    if (instr->dest.ssa.bit_size == 8) {
4866       aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
4867       bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
4868       if (!aligned)
4869          vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
4870    } else if (instr->dest.ssa.bit_size == 16) {
4871       aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
4872       if (!aligned)
4873          vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
4874    }
4875
4876    aco_opcode op;
4877
4878    switch (vec.size()) {
4879    case 1:
4880       op = aco_opcode::s_load_dword;
4881       break;
4882    case 2:
4883       op = aco_opcode::s_load_dwordx2;
4884       break;
4885    case 3:
4886       vec = bld.tmp(s4);
4887       trim = true;
4888    case 4:
4889       op = aco_opcode::s_load_dwordx4;
4890       break;
4891    case 6:
4892       vec = bld.tmp(s8);
4893       trim = true;
4894    case 8:
4895       op = aco_opcode::s_load_dwordx8;
4896       break;
4897    default:
4898       unreachable("unimplemented or forbidden load_push_constant.");
4899    }
4900
4901    bld.smem(op, Definition(vec), ptr, index);
4902
4903    if (!aligned) {
4904       Operand byte_offset = index_cv ? Operand((offset + index_cv->u32) % 4) : Operand(index);
4905       byte_align_scalar(ctx, vec, byte_offset, dst);
4906       return;
4907    }
4908
4909    if (trim) {
4910       emit_split_vector(ctx, vec, 4);
4911       RegClass rc = dst.size() == 3 ? s1 : s2;
4912       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4913                  emit_extract_vector(ctx, vec, 0, rc),
4914                  emit_extract_vector(ctx, vec, 1, rc),
4915                  emit_extract_vector(ctx, vec, 2, rc));
4916
4917    }
4918    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
4919 }
4920
4921 void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
4922 {
4923    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4924
4925    Builder bld(ctx->program, ctx->block);
4926
4927    uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
4928                         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
4929                         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
4930                         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
4931    if (ctx->options->chip_class >= GFX10) {
4932       desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
4933                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
4934                    S_008F0C_RESOURCE_LEVEL(1);
4935    } else {
4936       desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4937                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4938    }
4939
4940    unsigned base = nir_intrinsic_base(instr);
4941    unsigned range = nir_intrinsic_range(instr);
4942
4943    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
4944    if (base && offset.type() == RegType::sgpr)
4945       offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
4946    else if (base && offset.type() == RegType::vgpr)
4947       offset = bld.vadd32(bld.def(v1), Operand(base), offset);
4948
4949    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
4950                           bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
4951                           Operand(MIN2(base + range, ctx->shader->constant_data_size)),
4952                           Operand(desc_type));
4953    unsigned size = instr->dest.ssa.bit_size / 8;
4954    // TODO: get alignment information for subdword constants
4955    unsigned byte_align = size < 4 ? -1 : 0;
4956    load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, byte_align);
4957 }
4958
4959 void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
4960 {
4961    if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
4962       ctx->cf_info.exec_potentially_empty_discard = true;
4963
4964    ctx->program->needs_exact = true;
4965
4966    // TODO: optimize uniform conditions
4967    Builder bld(ctx->program, ctx->block);
4968    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4969    assert(src.regClass() == bld.lm);
4970    src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
4971    bld.pseudo(aco_opcode::p_discard_if, src);
4972    ctx->block->kind |= block_kind_uses_discard_if;
4973    return;
4974 }
4975
4976 void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
4977 {
4978    Builder bld(ctx->program, ctx->block);
4979
4980    if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
4981       ctx->cf_info.exec_potentially_empty_discard = true;
4982
4983    bool divergent = ctx->cf_info.parent_if.is_divergent ||
4984                     ctx->cf_info.parent_loop.has_divergent_continue;
4985
4986    if (ctx->block->loop_nest_depth &&
4987        ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
4988       /* we handle discards the same way as jump instructions */
4989       append_logical_end(ctx->block);
4990
4991       /* in loops, discard behaves like break */
4992       Block *linear_target = ctx->cf_info.parent_loop.exit;
4993       ctx->block->kind |= block_kind_discard;
4994
4995       if (!divergent) {
4996          /* uniform discard - loop ends here */
4997          assert(nir_instr_is_last(&instr->instr));
4998          ctx->block->kind |= block_kind_uniform;
4999          ctx->cf_info.has_branch = true;
5000          bld.branch(aco_opcode::p_branch);
5001          add_linear_edge(ctx->block->index, linear_target);
5002          return;
5003       }
5004
5005       /* we add a break right behind the discard() instructions */
5006       ctx->block->kind |= block_kind_break;
5007       unsigned idx = ctx->block->index;
5008
5009       ctx->cf_info.parent_loop.has_divergent_branch = true;
5010       ctx->cf_info.nir_to_aco[instr->instr.block->index] = idx;
5011
5012       /* remove critical edges from linear CFG */
5013       bld.branch(aco_opcode::p_branch);
5014       Block* break_block = ctx->program->create_and_insert_block();
5015       break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
5016       break_block->kind |= block_kind_uniform;
5017       add_linear_edge(idx, break_block);
5018       add_linear_edge(break_block->index, linear_target);
5019       bld.reset(break_block);
5020       bld.branch(aco_opcode::p_branch);
5021
5022       Block* continue_block = ctx->program->create_and_insert_block();
5023       continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
5024       add_linear_edge(idx, continue_block);
5025       append_logical_start(continue_block);
5026       ctx->block = continue_block;
5027
5028       return;
5029    }
5030
5031    /* it can currently happen that NIR doesn't remove the unreachable code */
5032    if (!nir_instr_is_last(&instr->instr)) {
5033       ctx->program->needs_exact = true;
5034       /* save exec somewhere temporarily so that it doesn't get
5035        * overwritten before the discard from outer exec masks */
5036       Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, bld.lm));
5037       bld.pseudo(aco_opcode::p_discard_if, cond);
5038       ctx->block->kind |= block_kind_uses_discard_if;
5039       return;
5040    }
5041
5042    /* This condition is incorrect for uniformly branched discards in a loop
5043     * predicated by a divergent condition, but the above code catches that case
5044     * and the discard would end up turning into a discard_if.
5045     * For example:
5046     * if (divergent) {
5047     *    while (...) {
5048     *       if (uniform) {
5049     *          discard;
5050     *       }
5051     *    }
5052     * }
5053     */
5054    if (!ctx->cf_info.parent_if.is_divergent) {
5055       /* program just ends here */
5056       ctx->block->kind |= block_kind_uniform;
5057       bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
5058               0 /* enabled mask */, 9 /* dest */,
5059               false /* compressed */, true/* done */, true /* valid mask */);
5060       bld.sopp(aco_opcode::s_endpgm);
5061       // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
5062    } else {
5063       ctx->block->kind |= block_kind_discard;
5064       /* branch and linear edge is added by visit_if() */
5065    }
5066 }
5067
/* Kind of descriptor requested from get_sampler_desc(). The PLANE_* values
 * must stay consecutive: get_sampler_desc() derives the plane offset from
 * (desc_type - ACO_DESC_PLANE_0). */
enum aco_descriptor_type {
   ACO_DESC_IMAGE = 0,   /* 8 dwords at the binding offset */
   ACO_DESC_FMASK = 1,   /* 8 dwords, 32 bytes after the binding offset */
   ACO_DESC_SAMPLER = 2, /* 4 dwords */
   ACO_DESC_BUFFER = 3,  /* 4 dwords */
   ACO_DESC_PLANE_0 = 4, /* 8 dwords at the binding offset */
   ACO_DESC_PLANE_1 = 5, /* 8 dwords, 32 bytes after the binding offset */
   ACO_DESC_PLANE_2 = 6, /* 4 dwords, 64 bytes after the binding offset */
};
5077
5078 static bool
5079 should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
5080    if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
5081       return false;
5082    ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
5083    return dim == ac_image_cube ||
5084           dim == ac_image_1darray ||
5085           dim == ac_image_2darray ||
5086           dim == ac_image_2darraymsaa;
5087 }
5088
5089 Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
5090                       enum aco_descriptor_type desc_type,
5091                       const nir_tex_instr *tex_instr, bool image, bool write)
5092 {
5093 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
5094    std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
5095    if (it != ctx->tex_desc.end())
5096       return it->second;
5097 */
5098    Temp index = Temp();
5099    bool index_set = false;
5100    unsigned constant_index = 0;
5101    unsigned descriptor_set;
5102    unsigned base_index;
5103    Builder bld(ctx->program, ctx->block);
5104
5105    if (!deref_instr) {
5106       assert(tex_instr && !image);
5107       descriptor_set = 0;
5108       base_index = tex_instr->sampler_index;
5109    } else {
5110       while(deref_instr->deref_type != nir_deref_type_var) {
5111          unsigned array_size = glsl_get_aoa_size(deref_instr->type);
5112          if (!array_size)
5113             array_size = 1;
5114
5115          assert(deref_instr->deref_type == nir_deref_type_array);
5116          nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
5117          if (const_value) {
5118             constant_index += array_size * const_value->u32;
5119          } else {
5120             Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
5121             if (indirect.type() == RegType::vgpr)
5122                indirect = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), indirect);
5123
5124             if (array_size != 1)
5125                indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
5126
5127             if (!index_set) {
5128                index = indirect;
5129                index_set = true;
5130             } else {
5131                index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
5132             }
5133          }
5134
5135          deref_instr = nir_src_as_deref(deref_instr->parent);
5136       }
5137       descriptor_set = deref_instr->var->data.descriptor_set;
5138       base_index = deref_instr->var->data.binding;
5139    }
5140
5141    Temp list = load_desc_ptr(ctx, descriptor_set);
5142    list = convert_pointer_to_64_bit(ctx, list);
5143
5144    struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
5145    struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
5146    unsigned offset = binding->offset;
5147    unsigned stride = binding->size;
5148    aco_opcode opcode;
5149    RegClass type;
5150
5151    assert(base_index < layout->binding_count);
5152
5153    switch (desc_type) {
5154    case ACO_DESC_IMAGE:
5155       type = s8;
5156       opcode = aco_opcode::s_load_dwordx8;
5157       break;
5158    case ACO_DESC_FMASK:
5159       type = s8;
5160       opcode = aco_opcode::s_load_dwordx8;
5161       offset += 32;
5162       break;
5163    case ACO_DESC_SAMPLER:
5164       type = s4;
5165       opcode = aco_opcode::s_load_dwordx4;
5166       if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
5167          offset += radv_combined_image_descriptor_sampler_offset(binding);
5168       break;
5169    case ACO_DESC_BUFFER:
5170       type = s4;
5171       opcode = aco_opcode::s_load_dwordx4;
5172       break;
5173    case ACO_DESC_PLANE_0:
5174    case ACO_DESC_PLANE_1:
5175       type = s8;
5176       opcode = aco_opcode::s_load_dwordx8;
5177       offset += 32 * (desc_type - ACO_DESC_PLANE_0);
5178       break;
5179    case ACO_DESC_PLANE_2:
5180       type = s4;
5181       opcode = aco_opcode::s_load_dwordx4;
5182       offset += 64;
5183       break;
5184    default:
5185       unreachable("invalid desc_type\n");
5186    }
5187
5188    offset += constant_index * stride;
5189
5190    if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
5191       (!index_set || binding->immutable_samplers_equal)) {
5192       if (binding->immutable_samplers_equal)
5193          constant_index = 0;
5194
5195       const uint32_t *samplers = radv_immutable_samplers(layout, binding);
5196       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5197                         Operand(samplers[constant_index * 4 + 0]),
5198                         Operand(samplers[constant_index * 4 + 1]),
5199                         Operand(samplers[constant_index * 4 + 2]),
5200                         Operand(samplers[constant_index * 4 + 3]));
5201    }
5202
5203    Operand off;
5204    if (!index_set) {
5205       off = bld.copy(bld.def(s1), Operand(offset));
5206    } else {
5207       off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
5208                                    bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
5209    }
5210
5211    Temp res = bld.smem(opcode, bld.def(type), list, off);
5212
5213    if (desc_type == ACO_DESC_PLANE_2) {
5214       Temp components[8];
5215       for (unsigned i = 0; i < 8; i++)
5216          components[i] = bld.tmp(s1);
5217       bld.pseudo(aco_opcode::p_split_vector,
5218                  Definition(components[0]),
5219                  Definition(components[1]),
5220                  Definition(components[2]),
5221                  Definition(components[3]),
5222                  res);
5223
5224       Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
5225       bld.pseudo(aco_opcode::p_split_vector,
5226                  bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
5227                  Definition(components[4]),
5228                  Definition(components[5]),
5229                  Definition(components[6]),
5230                  Definition(components[7]),
5231                  desc2);
5232
5233       res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
5234                        components[0], components[1], components[2], components[3],
5235                        components[4], components[5], components[6], components[7]);
5236    }
5237
5238    return res;
5239 }
5240
5241 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
5242 {
5243    switch (dim) {
5244    case GLSL_SAMPLER_DIM_BUF:
5245       return 1;
5246    case GLSL_SAMPLER_DIM_1D:
5247       return array ? 2 : 1;
5248    case GLSL_SAMPLER_DIM_2D:
5249       return array ? 3 : 2;
5250    case GLSL_SAMPLER_DIM_MS:
5251       return array ? 4 : 3;
5252    case GLSL_SAMPLER_DIM_3D:
5253    case GLSL_SAMPLER_DIM_CUBE:
5254       return 3;
5255    case GLSL_SAMPLER_DIM_RECT:
5256    case GLSL_SAMPLER_DIM_SUBPASS:
5257       return 2;
5258    case GLSL_SAMPLER_DIM_SUBPASS_MS:
5259       return 3;
5260    default:
5261       break;
5262    }
5263    return 0;
5264 }
5265
5266
5267 /* Adjust the sample index according to FMASK.
5268  *
5269  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
5270  * which is the identity mapping. Each nibble says which physical sample
5271  * should be fetched to get that sample.
5272  *
5273  * For example, 0x11111100 means there are only 2 samples stored and
5274  * the second sample covers 3/4 of the pixel. When reading samples 0
5275  * and 1, return physical sample 0 (determined by the first two 0s
5276  * in FMASK), otherwise return physical sample 1.
5277  *
5278  * The sample index should be adjusted as follows:
5279  *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
5280  */
5281 static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vector<Temp>& coords, Operand sample_index, Temp fmask_desc_ptr)
5282 {
5283    Builder bld(ctx->program, ctx->block);
5284    Temp fmask = bld.tmp(v1);
5285    unsigned dim = ctx->options->chip_class >= GFX10
5286                   ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
5287                   : 0;
5288
5289    Temp coord = da ? bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), coords[0], coords[1], coords[2]) :
5290                      bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), coords[0], coords[1]);
5291    aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 3, 1)};
5292    load->operands[0] = Operand(fmask_desc_ptr);
5293    load->operands[1] = Operand(s4); /* no sampler */
5294    load->operands[2] = Operand(coord);
5295    load->definitions[0] = Definition(fmask);
5296    load->glc = false;
5297    load->dlc = false;
5298    load->dmask = 0x1;
5299    load->unrm = true;
5300    load->da = da;
5301    load->dim = dim;
5302    load->can_reorder = true; /* fmask images shouldn't be modified */
5303    ctx->block->instructions.emplace_back(std::move(load));
5304
5305    Operand sample_index4;
5306    if (sample_index.isConstant() && sample_index.constantValue() < 16) {
5307       sample_index4 = Operand(sample_index.constantValue() << 2);
5308    } else if (sample_index.regClass() == s1) {
5309       sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
5310    } else {
5311       assert(sample_index.regClass() == v1);
5312       sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
5313    }
5314
5315    Temp final_sample;
5316    if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
5317       final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
5318    else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
5319       final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
5320    else
5321       final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
5322
5323    /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
5324     * resource descriptor is 0 (invalid),
5325     */
5326    Temp compare = bld.tmp(bld.lm);
5327    bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
5328                 Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
5329
5330    Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
5331
5332    /* Replace the MSAA sample index. */
5333    return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
5334 }
5335
5336 static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
5337 {
5338
5339    Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
5340    enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5341    bool is_array = glsl_sampler_type_is_array(type);
5342    ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
5343    assert(!add_frag_pos && "Input attachments should be lowered.");
5344    bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
5345    bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
5346    int count = image_type_to_components_count(dim, is_array);
5347    std::vector<Temp> coords(count);
5348    Builder bld(ctx->program, ctx->block);
5349
5350    if (is_ms) {
5351       count--;
5352       Temp src2 = get_ssa_temp(ctx, instr->src[2].ssa);
5353       /* get sample index */
5354       if (instr->intrinsic == nir_intrinsic_image_deref_load) {
5355          nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
5356          Operand sample_index = sample_cv ? Operand(sample_cv->u32) : Operand(emit_extract_vector(ctx, src2, 0, v1));
5357          std::vector<Temp> fmask_load_address;
5358          for (unsigned i = 0; i < (is_array ? 3 : 2); i++)
5359             fmask_load_address.emplace_back(emit_extract_vector(ctx, src0, i, v1));
5360
5361          Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
5362          coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr);
5363       } else {
5364          coords[count] = emit_extract_vector(ctx, src2, 0, v1);
5365       }
5366    }
5367
5368    if (gfx9_1d) {
5369       coords[0] = emit_extract_vector(ctx, src0, 0, v1);
5370       coords.resize(coords.size() + 1);
5371       coords[1] = bld.copy(bld.def(v1), Operand(0u));
5372       if (is_array)
5373          coords[2] = emit_extract_vector(ctx, src0, 1, v1);
5374    } else {
5375       for (int i = 0; i < count; i++)
5376          coords[i] = emit_extract_vector(ctx, src0, i, v1);
5377    }
5378
5379    if (instr->intrinsic == nir_intrinsic_image_deref_load ||
5380        instr->intrinsic == nir_intrinsic_image_deref_store) {
5381       int lod_index = instr->intrinsic == nir_intrinsic_image_deref_load ? 3 : 4;
5382       bool level_zero = nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
5383
5384       if (!level_zero)
5385          coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa));
5386    }
5387
5388    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
5389    for (unsigned i = 0; i < coords.size(); i++)
5390       vec->operands[i] = Operand(coords[i]);
5391    Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
5392    vec->definitions[0] = Definition(res);
5393    ctx->block->instructions.emplace_back(std::move(vec));
5394    return res;
5395 }
5396
5397
5398 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
5399 {
5400    Builder bld(ctx->program, ctx->block);
5401    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
5402    const struct glsl_type *type = glsl_without_array(var->type);
5403    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5404    bool is_array = glsl_sampler_type_is_array(type);
5405    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5406
5407    if (dim == GLSL_SAMPLER_DIM_BUF) {
5408       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
5409       unsigned num_channels = util_last_bit(mask);
5410       Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
5411       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
5412
5413       aco_opcode opcode;
5414       switch (num_channels) {
5415       case 1:
5416          opcode = aco_opcode::buffer_load_format_x;
5417          break;
5418       case 2:
5419          opcode = aco_opcode::buffer_load_format_xy;
5420          break;
5421       case 3:
5422          opcode = aco_opcode::buffer_load_format_xyz;
5423          break;
5424       case 4:
5425          opcode = aco_opcode::buffer_load_format_xyzw;
5426          break;
5427       default:
5428          unreachable(">4 channel buffer image load");
5429       }
5430       aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
5431       load->operands[0] = Operand(rsrc);
5432       load->operands[1] = Operand(vindex);
5433       load->operands[2] = Operand((uint32_t) 0);
5434       Temp tmp;
5435       if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
5436          tmp = dst;
5437       else
5438          tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
5439       load->definitions[0] = Definition(tmp);
5440       load->idxen = true;
5441       load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
5442       load->dlc = load->glc && ctx->options->chip_class >= GFX10;
5443       load->barrier = barrier_image;
5444       ctx->block->instructions.emplace_back(std::move(load));
5445
5446       expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
5447       return;
5448    }
5449
5450    Temp coords = get_image_coords(ctx, instr, type);
5451    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
5452
5453    unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
5454    unsigned num_components = util_bitcount(dmask);
5455    Temp tmp;
5456    if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
5457       tmp = dst;
5458    else
5459       tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
5460
5461    bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
5462    aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
5463
5464    aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1)};
5465    load->operands[0] = Operand(resource);
5466    load->operands[1] = Operand(s4); /* no sampler */
5467    load->operands[2] = Operand(coords);
5468    load->definitions[0] = Definition(tmp);
5469    load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
5470    load->dlc = load->glc && ctx->options->chip_class >= GFX10;
5471    load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
5472    load->dmask = dmask;
5473    load->unrm = true;
5474    load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
5475    load->barrier = barrier_image;
5476    ctx->block->instructions.emplace_back(std::move(load));
5477
5478    expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
5479    return;
5480 }
5481
5482 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
5483 {
5484    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
5485    const struct glsl_type *type = glsl_without_array(var->type);
5486    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5487    bool is_array = glsl_sampler_type_is_array(type);
5488    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
5489
5490    bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
5491
5492    if (dim == GLSL_SAMPLER_DIM_BUF) {
5493       Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
5494       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
5495       aco_opcode opcode;
5496       switch (data.size()) {
5497       case 1:
5498          opcode = aco_opcode::buffer_store_format_x;
5499          break;
5500       case 2:
5501          opcode = aco_opcode::buffer_store_format_xy;
5502          break;
5503       case 3:
5504          opcode = aco_opcode::buffer_store_format_xyz;
5505          break;
5506       case 4:
5507          opcode = aco_opcode::buffer_store_format_xyzw;
5508          break;
5509       default:
5510          unreachable(">4 channel buffer image store");
5511       }
5512       aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
5513       store->operands[0] = Operand(rsrc);
5514       store->operands[1] = Operand(vindex);
5515       store->operands[2] = Operand((uint32_t) 0);
5516       store->operands[3] = Operand(data);
5517       store->idxen = true;
5518       store->glc = glc;
5519       store->dlc = false;
5520       store->disable_wqm = true;
5521       store->barrier = barrier_image;
5522       ctx->program->needs_exact = true;
5523       ctx->block->instructions.emplace_back(std::move(store));
5524       return;
5525    }
5526
5527    assert(data.type() == RegType::vgpr);
5528    Temp coords = get_image_coords(ctx, instr, type);
5529    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
5530
5531    bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
5532    aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
5533
5534    aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 0)};
5535    store->operands[0] = Operand(resource);
5536    store->operands[1] = Operand(data);
5537    store->operands[2] = Operand(coords);
5538    store->glc = glc;
5539    store->dlc = false;
5540    store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
5541    store->dmask = (1 << data.size()) - 1;
5542    store->unrm = true;
5543    store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
5544    store->disable_wqm = true;
5545    store->barrier = barrier_image;
5546    ctx->program->needs_exact = true;
5547    ctx->block->instructions.emplace_back(std::move(store));
5548    return;
5549 }
5550
5551 void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
5552 {
5553    /* return the previous value if dest is ever used */
5554    bool return_previous = false;
5555    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
5556       return_previous = true;
5557       break;
5558    }
5559    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
5560       return_previous = true;
5561       break;
5562    }
5563
5564    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
5565    const struct glsl_type *type = glsl_without_array(var->type);
5566    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5567    bool is_array = glsl_sampler_type_is_array(type);
5568    Builder bld(ctx->program, ctx->block);
5569
5570    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
5571    assert(data.size() == 1 && "64bit ssbo atomics not yet implemented.");
5572
5573    if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
5574       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
5575
5576    aco_opcode buf_op, image_op;
5577    switch (instr->intrinsic) {
5578       case nir_intrinsic_image_deref_atomic_add:
5579          buf_op = aco_opcode::buffer_atomic_add;
5580          image_op = aco_opcode::image_atomic_add;
5581          break;
5582       case nir_intrinsic_image_deref_atomic_umin:
5583          buf_op = aco_opcode::buffer_atomic_umin;
5584          image_op = aco_opcode::image_atomic_umin;
5585          break;
5586       case nir_intrinsic_image_deref_atomic_imin:
5587          buf_op = aco_opcode::buffer_atomic_smin;
5588          image_op = aco_opcode::image_atomic_smin;
5589          break;
5590       case nir_intrinsic_image_deref_atomic_umax:
5591          buf_op = aco_opcode::buffer_atomic_umax;
5592          image_op = aco_opcode::image_atomic_umax;
5593          break;
5594       case nir_intrinsic_image_deref_atomic_imax:
5595          buf_op = aco_opcode::buffer_atomic_smax;
5596          image_op = aco_opcode::image_atomic_smax;
5597          break;
5598       case nir_intrinsic_image_deref_atomic_and:
5599          buf_op = aco_opcode::buffer_atomic_and;
5600          image_op = aco_opcode::image_atomic_and;
5601          break;
5602       case nir_intrinsic_image_deref_atomic_or:
5603          buf_op = aco_opcode::buffer_atomic_or;
5604          image_op = aco_opcode::image_atomic_or;
5605          break;
5606       case nir_intrinsic_image_deref_atomic_xor:
5607          buf_op = aco_opcode::buffer_atomic_xor;
5608          image_op = aco_opcode::image_atomic_xor;
5609          break;
5610       case nir_intrinsic_image_deref_atomic_exchange:
5611          buf_op = aco_opcode::buffer_atomic_swap;
5612          image_op = aco_opcode::image_atomic_swap;
5613          break;
5614       case nir_intrinsic_image_deref_atomic_comp_swap:
5615          buf_op = aco_opcode::buffer_atomic_cmpswap;
5616          image_op = aco_opcode::image_atomic_cmpswap;
5617          break;
5618       default:
5619          unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
5620    }
5621
5622    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5623
5624    if (dim == GLSL_SAMPLER_DIM_BUF) {
5625       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
5626       Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
5627       //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
5628       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
5629       mubuf->operands[0] = Operand(resource);
5630       mubuf->operands[1] = Operand(vindex);
5631       mubuf->operands[2] = Operand((uint32_t)0);
5632       mubuf->operands[3] = Operand(data);
5633       if (return_previous)
5634          mubuf->definitions[0] = Definition(dst);
5635       mubuf->offset = 0;
5636       mubuf->idxen = true;
5637       mubuf->glc = return_previous;
5638       mubuf->dlc = false; /* Not needed for atomics */
5639       mubuf->disable_wqm = true;
5640       mubuf->barrier = barrier_image;
5641       ctx->program->needs_exact = true;
5642       ctx->block->instructions.emplace_back(std::move(mubuf));
5643       return;
5644    }
5645
5646    Temp coords = get_image_coords(ctx, instr, type);
5647    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
5648    aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 3, return_previous ? 1 : 0)};
5649    mimg->operands[0] = Operand(resource);
5650    mimg->operands[1] = Operand(data);
5651    mimg->operands[2] = Operand(coords);
5652    if (return_previous)
5653       mimg->definitions[0] = Definition(dst);
5654    mimg->glc = return_previous;
5655    mimg->dlc = false; /* Not needed for atomics */
5656    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
5657    mimg->dmask = (1 << data.size()) - 1;
5658    mimg->unrm = true;
5659    mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
5660    mimg->disable_wqm = true;
5661    mimg->barrier = barrier_image;
5662    ctx->program->needs_exact = true;
5663    ctx->block->instructions.emplace_back(std::move(mimg));
5664    return;
5665 }
5666
5667 void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
5668 {
5669    if (in_elements && ctx->options->chip_class == GFX8) {
5670       /* we only have to divide by 1, 2, 4, 8, 12 or 16 */
5671       Builder bld(ctx->program, ctx->block);
5672
5673       Temp size = emit_extract_vector(ctx, desc, 2, s1);
5674
5675       Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), bld.copy(bld.def(v1), Operand(0xaaaaaaabu)), size);
5676       size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.as_uniform(size_div3), Operand(1u));
5677
5678       Temp stride = emit_extract_vector(ctx, desc, 1, s1);
5679       stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
5680
5681       Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand(12u));
5682       size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));
5683
5684       Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;
5685       bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc),
5686                size, bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));
5687       if (dst.type() == RegType::vgpr)
5688          bld.copy(Definition(dst), shr_dst);
5689
5690       /* TODO: we can probably calculate this faster with v_skip when stride != 12 */
5691    } else {
5692       emit_extract_vector(ctx, desc, 2, dst);
5693    }
5694 }
5695
5696 void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
5697 {
5698    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
5699    const struct glsl_type *type = glsl_without_array(var->type);
5700    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5701    bool is_array = glsl_sampler_type_is_array(type);
5702    Builder bld(ctx->program, ctx->block);
5703
5704    if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
5705       Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
5706       return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
5707    }
5708
5709    /* LOD */
5710    Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
5711
5712    /* Resource */
5713    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
5714
5715    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5716
5717    aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)};
5718    mimg->operands[0] = Operand(resource);
5719    mimg->operands[1] = Operand(s4); /* no sampler */
5720    mimg->operands[2] = Operand(lod);
5721    uint8_t& dmask = mimg->dmask;
5722    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
5723    mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
5724    mimg->da = glsl_sampler_type_is_array(type);
5725    mimg->can_reorder = true;
5726    Definition& def = mimg->definitions[0];
5727    ctx->block->instructions.emplace_back(std::move(mimg));
5728
5729    if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
5730        glsl_sampler_type_is_array(type)) {
5731
5732       assert(instr->dest.ssa.num_components == 3);
5733       Temp tmp = {ctx->program->allocateId(), v3};
5734       def = Definition(tmp);
5735       emit_split_vector(ctx, tmp, 3);
5736
5737       /* divide 3rd value by 6 by multiplying with magic number */
5738       Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
5739       Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
5740
5741       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5742                  emit_extract_vector(ctx, tmp, 0, v1),
5743                  emit_extract_vector(ctx, tmp, 1, v1),
5744                  by_6);
5745
5746    } else if (ctx->options->chip_class == GFX9 &&
5747               glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
5748               glsl_sampler_type_is_array(type)) {
5749       assert(instr->dest.ssa.num_components == 2);
5750       def = Definition(dst);
5751       dmask = 0x5;
5752    } else {
5753       def = Definition(dst);
5754    }
5755
5756    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
5757 }
5758
5759 void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
5760 {
5761    Builder bld(ctx->program, ctx->block);
5762    unsigned num_components = instr->num_components;
5763
5764    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5765    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5766    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
5767
5768    bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
5769    unsigned size = instr->dest.ssa.bit_size / 8;
5770    int byte_align = 0;
5771    if (size < 4) {
5772       unsigned align_mul = nir_intrinsic_align_mul(instr);
5773       unsigned align_offset = nir_intrinsic_align_offset(instr);
5774       byte_align = align_mul % 4 == 0 ? align_offset : -1;
5775    }
5776    load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), byte_align, glc, false);
5777 }
5778
5779 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
5780 {
5781    Builder bld(ctx->program, ctx->block);
5782    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
5783    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
5784    unsigned writemask = nir_intrinsic_write_mask(instr);
5785    Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
5786
5787    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
5788    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
5789
5790    bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
5791                ctx->options->chip_class >= GFX8 &&
5792                elem_size_bytes >= 4;
5793    if (smem)
5794       offset = bld.as_uniform(offset);
5795    bool smem_nonfs = smem && ctx->stage != fragment_fs;
5796
5797    while (writemask) {
5798       int start, count;
5799       u_bit_scan_consecutive_range(&writemask, &start, &count);
5800       if (count == 3 && (smem || ctx->options->chip_class == GFX6)) {
5801          /* GFX6 doesn't support storing vec3, split it. */
5802          writemask |= 1u << (start + 2);
5803          count = 2;
5804       }
5805       int num_bytes = count * elem_size_bytes;
5806
5807       /* dword or larger stores have to be dword-aligned */
5808       if (elem_size_bytes < 4 && num_bytes > 2) {
5809          // TODO: improve alignment check of sub-dword stores
5810          unsigned count_new = 2 / elem_size_bytes;
5811          writemask |= ((1 << (count - count_new)) - 1) << (start + count_new);
5812          count = count_new;
5813          num_bytes = 2;
5814       }
5815
5816       if (num_bytes > 16) {
5817          assert(elem_size_bytes == 8);
5818          writemask |= (((count - 2) << 1) - 1) << (start + 2);
5819          count = 2;
5820          num_bytes = 16;
5821       }
5822
5823       Temp write_data;
5824       if (elem_size_bytes < 4) {
5825          if (data.type() == RegType::sgpr) {
5826             data = as_vgpr(ctx, data);
5827             emit_split_vector(ctx, data, 4 * data.size() / elem_size_bytes);
5828          }
5829          RegClass rc = RegClass(RegType::vgpr, elem_size_bytes).as_subdword();
5830          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5831          for (int i = 0; i < count; i++)
5832             vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, rc));
5833          write_data = bld.tmp(RegClass(RegType::vgpr, num_bytes).as_subdword());
5834          vec->definitions[0] = Definition(write_data);
5835          bld.insert(std::move(vec));
5836       } else if (count != instr->num_components) {
5837          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5838          for (int i = 0; i < count; i++) {
5839             Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
5840             vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
5841          }
5842          write_data = bld.tmp(!smem ? RegType::vgpr : smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
5843          vec->definitions[0] = Definition(write_data);
5844          ctx->block->instructions.emplace_back(std::move(vec));
5845       } else if (!smem && data.type() != RegType::vgpr) {
5846          assert(num_bytes % 4 == 0);
5847          write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
5848       } else if (smem_nonfs && data.type() == RegType::vgpr) {
5849          assert(num_bytes % 4 == 0);
5850          write_data = bld.as_uniform(data);
5851       } else {
5852          write_data = data;
5853       }
5854
5855       aco_opcode vmem_op, smem_op = aco_opcode::last_opcode;
5856       switch (num_bytes) {
5857          case 1:
5858             vmem_op = aco_opcode::buffer_store_byte;
5859             break;
5860          case 2:
5861             vmem_op = aco_opcode::buffer_store_short;
5862             break;
5863          case 4:
5864             vmem_op = aco_opcode::buffer_store_dword;
5865             smem_op = aco_opcode::s_buffer_store_dword;
5866             break;
5867          case 8:
5868             vmem_op = aco_opcode::buffer_store_dwordx2;
5869             smem_op = aco_opcode::s_buffer_store_dwordx2;
5870             break;
5871          case 12:
5872             vmem_op = aco_opcode::buffer_store_dwordx3;
5873             assert(!smem && ctx->options->chip_class > GFX6);
5874             break;
5875          case 16:
5876             vmem_op = aco_opcode::buffer_store_dwordx4;
5877             smem_op = aco_opcode::s_buffer_store_dwordx4;
5878             break;
5879          default:
5880             unreachable("Store SSBO not implemented for this size.");
5881       }
5882       if (ctx->stage == fragment_fs)
5883          smem_op = aco_opcode::p_fs_buffer_store_smem;
5884
5885       if (smem) {
5886          aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
5887          store->operands[0] = Operand(rsrc);
5888          if (start) {
5889             Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5890                                 offset, Operand(start * elem_size_bytes));
5891             store->operands[1] = Operand(off);
5892          } else {
5893             store->operands[1] = Operand(offset);
5894          }
5895          if (smem_op != aco_opcode::p_fs_buffer_store_smem)
5896             store->operands[1].setFixed(m0);
5897          store->operands[2] = Operand(write_data);
5898          store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
5899          store->dlc = false;
5900          store->disable_wqm = true;
5901          store->barrier = barrier_buffer;
5902          ctx->block->instructions.emplace_back(std::move(store));
5903          ctx->program->wb_smem_l1_on_end = true;
5904          if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
5905             ctx->block->kind |= block_kind_needs_lowering;
5906             ctx->program->needs_exact = true;
5907          }
5908       } else {
5909          aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
5910          store->operands[0] = Operand(rsrc);
5911          store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
5912          store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
5913          store->operands[3] = Operand(write_data);
5914          store->offset = start * elem_size_bytes;
5915          store->offen = (offset.type() == RegType::vgpr);
5916          store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
5917          store->dlc = false;
5918          store->disable_wqm = true;
5919          store->barrier = barrier_buffer;
5920          ctx->program->needs_exact = true;
5921          ctx->block->instructions.emplace_back(std::move(store));
5922       }
5923    }
5924 }
5925
5926 void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
5927 {
5928    /* return the previous value if dest is ever used */
5929    bool return_previous = false;
5930    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
5931       return_previous = true;
5932       break;
5933    }
5934    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
5935       return_previous = true;
5936       break;
5937    }
5938
5939    Builder bld(ctx->program, ctx->block);
5940    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
5941
5942    if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
5943       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
5944                         get_ssa_temp(ctx, instr->src[3].ssa), data);
5945
5946    Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
5947    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5948    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
5949
5950    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5951
5952    aco_opcode op32, op64;
5953    switch (instr->intrinsic) {
5954       case nir_intrinsic_ssbo_atomic_add:
5955          op32 = aco_opcode::buffer_atomic_add;
5956          op64 = aco_opcode::buffer_atomic_add_x2;
5957          break;
5958       case nir_intrinsic_ssbo_atomic_imin:
5959          op32 = aco_opcode::buffer_atomic_smin;
5960          op64 = aco_opcode::buffer_atomic_smin_x2;
5961          break;
5962       case nir_intrinsic_ssbo_atomic_umin:
5963          op32 = aco_opcode::buffer_atomic_umin;
5964          op64 = aco_opcode::buffer_atomic_umin_x2;
5965          break;
5966       case nir_intrinsic_ssbo_atomic_imax:
5967          op32 = aco_opcode::buffer_atomic_smax;
5968          op64 = aco_opcode::buffer_atomic_smax_x2;
5969          break;
5970       case nir_intrinsic_ssbo_atomic_umax:
5971          op32 = aco_opcode::buffer_atomic_umax;
5972          op64 = aco_opcode::buffer_atomic_umax_x2;
5973          break;
5974       case nir_intrinsic_ssbo_atomic_and:
5975          op32 = aco_opcode::buffer_atomic_and;
5976          op64 = aco_opcode::buffer_atomic_and_x2;
5977          break;
5978       case nir_intrinsic_ssbo_atomic_or:
5979          op32 = aco_opcode::buffer_atomic_or;
5980          op64 = aco_opcode::buffer_atomic_or_x2;
5981          break;
5982       case nir_intrinsic_ssbo_atomic_xor:
5983          op32 = aco_opcode::buffer_atomic_xor;
5984          op64 = aco_opcode::buffer_atomic_xor_x2;
5985          break;
5986       case nir_intrinsic_ssbo_atomic_exchange:
5987          op32 = aco_opcode::buffer_atomic_swap;
5988          op64 = aco_opcode::buffer_atomic_swap_x2;
5989          break;
5990       case nir_intrinsic_ssbo_atomic_comp_swap:
5991          op32 = aco_opcode::buffer_atomic_cmpswap;
5992          op64 = aco_opcode::buffer_atomic_cmpswap_x2;
5993          break;
5994       default:
5995          unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
5996    }
5997    aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
5998    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
5999    mubuf->operands[0] = Operand(rsrc);
6000    mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6001    mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
6002    mubuf->operands[3] = Operand(data);
6003    if (return_previous)
6004       mubuf->definitions[0] = Definition(dst);
6005    mubuf->offset = 0;
6006    mubuf->offen = (offset.type() == RegType::vgpr);
6007    mubuf->glc = return_previous;
6008    mubuf->dlc = false; /* Not needed for atomics */
6009    mubuf->disable_wqm = true;
6010    mubuf->barrier = barrier_buffer;
6011    ctx->program->needs_exact = true;
6012    ctx->block->instructions.emplace_back(std::move(mubuf));
6013 }
6014
6015 void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
6016
6017    Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6018    Builder bld(ctx->program, ctx->block);
6019    Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
6020    get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
6021 }
6022
6023 Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
6024 {
6025    uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6026                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
6027
6028    if (addr.type() == RegType::vgpr)
6029       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), Operand(-1u), Operand(rsrc_conf));
6030    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), Operand(rsrc_conf));
6031 }
6032
6033 void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
6034 {
6035    Builder bld(ctx->program, ctx->block);
6036    unsigned num_components = instr->num_components;
6037    unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8;
6038
6039    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6040    Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
6041
6042    bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
6043    bool dlc = glc && ctx->options->chip_class >= GFX10;
6044    aco_opcode op;
6045    if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
6046       bool global = ctx->options->chip_class >= GFX9;
6047
6048       if (ctx->options->chip_class >= GFX7) {
6049          aco_opcode op;
6050          switch (num_bytes) {
6051          case 4:
6052             op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
6053             break;
6054          case 8:
6055             op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
6056             break;
6057          case 12:
6058             op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
6059             break;
6060          case 16:
6061             op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
6062             break;
6063          default:
6064             unreachable("load_global not implemented for this size.");
6065          }
6066
6067          aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
6068          flat->operands[0] = Operand(addr);
6069          flat->operands[1] = Operand(s1);
6070          flat->glc = glc;
6071          flat->dlc = dlc;
6072          flat->barrier = barrier_buffer;
6073
6074          if (dst.type() == RegType::sgpr) {
6075             Temp vec = bld.tmp(RegType::vgpr, dst.size());
6076             flat->definitions[0] = Definition(vec);
6077             ctx->block->instructions.emplace_back(std::move(flat));
6078             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
6079          } else {
6080             flat->definitions[0] = Definition(dst);
6081             ctx->block->instructions.emplace_back(std::move(flat));
6082          }
6083          emit_split_vector(ctx, dst, num_components);
6084       } else {
6085          assert(ctx->options->chip_class == GFX6);
6086
6087          /* GFX6 doesn't support loading vec3, expand to vec4. */
6088          num_bytes = num_bytes == 12 ? 16 : num_bytes;
6089
6090          aco_opcode op;
6091          switch (num_bytes) {
6092          case 4:
6093             op = aco_opcode::buffer_load_dword;
6094             break;
6095          case 8:
6096             op = aco_opcode::buffer_load_dwordx2;
6097             break;
6098          case 16:
6099             op = aco_opcode::buffer_load_dwordx4;
6100             break;
6101          default:
6102             unreachable("load_global not implemented for this size.");
6103          }
6104
6105          Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6106
6107          aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
6108          mubuf->operands[0] = Operand(rsrc);
6109          mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6110          mubuf->operands[2] = Operand(0u);
6111          mubuf->glc = glc;
6112          mubuf->dlc = false;
6113          mubuf->offset = 0;
6114          mubuf->addr64 = addr.type() == RegType::vgpr;
6115          mubuf->disable_wqm = false;
6116          mubuf->barrier = barrier_buffer;
6117          aco_ptr<Instruction> instr = std::move(mubuf);
6118
6119          /* expand vector */
6120          if (dst.size() == 3) {
6121             Temp vec = bld.tmp(v4);
6122             instr->definitions[0] = Definition(vec);
6123             bld.insert(std::move(instr));
6124             emit_split_vector(ctx, vec, 4);
6125
6126             instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, 3, 1));
6127             instr->operands[0] = Operand(emit_extract_vector(ctx, vec, 0, v1));
6128             instr->operands[1] = Operand(emit_extract_vector(ctx, vec, 1, v1));
6129             instr->operands[2] = Operand(emit_extract_vector(ctx, vec, 2, v1));
6130          }
6131
6132          if (dst.type() == RegType::sgpr) {
6133             Temp vec = bld.tmp(RegType::vgpr, dst.size());
6134             instr->definitions[0] = Definition(vec);
6135             bld.insert(std::move(instr));
6136             expand_vector(ctx, vec, dst, num_components, (1 << num_components) - 1);
6137             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
6138          } else {
6139             instr->definitions[0] = Definition(dst);
6140             bld.insert(std::move(instr));
6141             emit_split_vector(ctx, dst, num_components);
6142          }
6143       }
6144    } else {
6145       switch (num_bytes) {
6146          case 4:
6147             op = aco_opcode::s_load_dword;
6148             break;
6149          case 8:
6150             op = aco_opcode::s_load_dwordx2;
6151             break;
6152          case 12:
6153          case 16:
6154             op = aco_opcode::s_load_dwordx4;
6155             break;
6156          default:
6157             unreachable("load_global not implemented for this size.");
6158       }
6159       aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
6160       load->operands[0] = Operand(addr);
6161       load->operands[1] = Operand(0u);
6162       load->definitions[0] = Definition(dst);
6163       load->glc = glc;
6164       load->dlc = dlc;
6165       load->barrier = barrier_buffer;
6166       assert(ctx->options->chip_class >= GFX8 || !glc);
6167
6168       if (dst.size() == 3) {
6169          /* trim vector */
6170          Temp vec = bld.tmp(s4);
6171          load->definitions[0] = Definition(vec);
6172          ctx->block->instructions.emplace_back(std::move(load));
6173          emit_split_vector(ctx, vec, 4);
6174
6175          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
6176                     emit_extract_vector(ctx, vec, 0, s1),
6177                     emit_extract_vector(ctx, vec, 1, s1),
6178                     emit_extract_vector(ctx, vec, 2, s1));
6179       } else {
6180          ctx->block->instructions.emplace_back(std::move(load));
6181       }
6182    }
6183 }
6184
6185 void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
6186 {
6187    Builder bld(ctx->program, ctx->block);
6188    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6189
6190    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6191    Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
6192
6193    if (ctx->options->chip_class >= GFX7)
6194       addr = as_vgpr(ctx, addr);
6195
6196    unsigned writemask = nir_intrinsic_write_mask(instr);
6197    while (writemask) {
6198       int start, count;
6199       u_bit_scan_consecutive_range(&writemask, &start, &count);
6200       if (count == 3 && ctx->options->chip_class == GFX6) {
6201          /* GFX6 doesn't support storing vec3, split it. */
6202          writemask |= 1u << (start + 2);
6203          count = 2;
6204       }
6205       unsigned num_bytes = count * elem_size_bytes;
6206
6207       Temp write_data = data;
6208       if (count != instr->num_components) {
6209          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
6210          for (int i = 0; i < count; i++)
6211             vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
6212          write_data = bld.tmp(RegType::vgpr, count);
6213          vec->definitions[0] = Definition(write_data);
6214          ctx->block->instructions.emplace_back(std::move(vec));
6215       }
6216
6217       bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6218       unsigned offset = start * elem_size_bytes;
6219
6220       if (ctx->options->chip_class >= GFX7) {
6221          if (offset > 0 && ctx->options->chip_class < GFX9) {
6222             Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
6223             Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
6224             Temp carry = bld.tmp(bld.lm);
6225             bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
6226
6227             bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
6228                      Operand(offset), addr0);
6229             bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
6230                      Operand(0u), addr1,
6231                      carry).def(1).setHint(vcc);
6232
6233             addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
6234
6235             offset = 0;
6236          }
6237
6238          bool global = ctx->options->chip_class >= GFX9;
6239          aco_opcode op;
6240          switch (num_bytes) {
6241          case 4:
6242             op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
6243             break;
6244          case 8:
6245             op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6246             break;
6247          case 12:
6248             op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6249             break;
6250          case 16:
6251             op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6252             break;
6253          default:
6254             unreachable("store_global not implemented for this size.");
6255          }
6256
6257          aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6258          flat->operands[0] = Operand(addr);
6259          flat->operands[1] = Operand(s1);
6260          flat->operands[2] = Operand(data);
6261          flat->glc = glc;
6262          flat->dlc = false;
6263          flat->offset = offset;
6264          flat->disable_wqm = true;
6265          flat->barrier = barrier_buffer;
6266          ctx->program->needs_exact = true;
6267          ctx->block->instructions.emplace_back(std::move(flat));
6268       } else {
6269          assert(ctx->options->chip_class == GFX6);
6270
6271          aco_opcode op;
6272          switch (num_bytes) {
6273          case 4:
6274             op = aco_opcode::buffer_store_dword;
6275             break;
6276          case 8:
6277             op = aco_opcode::buffer_store_dwordx2;
6278             break;
6279          case 16:
6280             op = aco_opcode::buffer_store_dwordx4;
6281             break;
6282          default:
6283             unreachable("store_global not implemented for this size.");
6284          }
6285
6286          Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6287
6288          aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6289          mubuf->operands[0] = Operand(rsrc);
6290          mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6291          mubuf->operands[2] = Operand(0u);
6292          mubuf->operands[3] = Operand(write_data);
6293          mubuf->glc = glc;
6294          mubuf->dlc = false;
6295          mubuf->offset = offset;
6296          mubuf->addr64 = addr.type() == RegType::vgpr;
6297          mubuf->disable_wqm = true;
6298          mubuf->barrier = barrier_buffer;
6299          ctx->program->needs_exact = true;
6300          ctx->block->instructions.emplace_back(std::move(mubuf));
6301       }
6302    }
6303 }
6304
6305 void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
6306 {
6307    /* return the previous value if dest is ever used */
6308    bool return_previous = false;
6309    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
6310       return_previous = true;
6311       break;
6312    }
6313    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
6314       return_previous = true;
6315       break;
6316    }
6317
6318    Builder bld(ctx->program, ctx->block);
6319    Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
6320    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6321
6322    if (ctx->options->chip_class >= GFX7)
6323       addr = as_vgpr(ctx, addr);
6324
6325    if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
6326       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6327                         get_ssa_temp(ctx, instr->src[2].ssa), data);
6328
6329    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6330
6331    aco_opcode op32, op64;
6332
6333    if (ctx->options->chip_class >= GFX7) {
6334       bool global = ctx->options->chip_class >= GFX9;
6335       switch (instr->intrinsic) {
6336          case nir_intrinsic_global_atomic_add:
6337             op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6338             op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6339             break;
6340          case nir_intrinsic_global_atomic_imin:
6341             op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6342             op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6343             break;
6344          case nir_intrinsic_global_atomic_umin:
6345             op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6346             op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6347             break;
6348          case nir_intrinsic_global_atomic_imax:
6349             op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6350             op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6351             break;
6352          case nir_intrinsic_global_atomic_umax:
6353             op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6354             op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6355             break;
6356          case nir_intrinsic_global_atomic_and:
6357             op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6358             op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6359             break;
6360          case nir_intrinsic_global_atomic_or:
6361             op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6362             op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6363             break;
6364          case nir_intrinsic_global_atomic_xor:
6365             op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6366             op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6367             break;
6368          case nir_intrinsic_global_atomic_exchange:
6369             op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6370             op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6371             break;
6372          case nir_intrinsic_global_atomic_comp_swap:
6373             op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6374             op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6375             break;
6376          default:
6377             unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
6378       }
6379
6380       aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6381       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
6382       flat->operands[0] = Operand(addr);
6383       flat->operands[1] = Operand(s1);
6384       flat->operands[2] = Operand(data);
6385       if (return_previous)
6386          flat->definitions[0] = Definition(dst);
6387       flat->glc = return_previous;
6388       flat->dlc = false; /* Not needed for atomics */
6389       flat->offset = 0;
6390       flat->disable_wqm = true;
6391       flat->barrier = barrier_buffer;
6392       ctx->program->needs_exact = true;
6393       ctx->block->instructions.emplace_back(std::move(flat));
6394    } else {
6395       assert(ctx->options->chip_class == GFX6);
6396
6397       switch (instr->intrinsic) {
6398          case nir_intrinsic_global_atomic_add:
6399             op32 = aco_opcode::buffer_atomic_add;
6400             op64 = aco_opcode::buffer_atomic_add_x2;
6401             break;
6402          case nir_intrinsic_global_atomic_imin:
6403             op32 = aco_opcode::buffer_atomic_smin;
6404             op64 = aco_opcode::buffer_atomic_smin_x2;
6405             break;
6406          case nir_intrinsic_global_atomic_umin:
6407             op32 = aco_opcode::buffer_atomic_umin;
6408             op64 = aco_opcode::buffer_atomic_umin_x2;
6409             break;
6410          case nir_intrinsic_global_atomic_imax:
6411             op32 = aco_opcode::buffer_atomic_smax;
6412             op64 = aco_opcode::buffer_atomic_smax_x2;
6413             break;
6414          case nir_intrinsic_global_atomic_umax:
6415             op32 = aco_opcode::buffer_atomic_umax;
6416             op64 = aco_opcode::buffer_atomic_umax_x2;
6417             break;
6418          case nir_intrinsic_global_atomic_and:
6419             op32 = aco_opcode::buffer_atomic_and;
6420             op64 = aco_opcode::buffer_atomic_and_x2;
6421             break;
6422          case nir_intrinsic_global_atomic_or:
6423             op32 = aco_opcode::buffer_atomic_or;
6424             op64 = aco_opcode::buffer_atomic_or_x2;
6425             break;
6426          case nir_intrinsic_global_atomic_xor:
6427             op32 = aco_opcode::buffer_atomic_xor;
6428             op64 = aco_opcode::buffer_atomic_xor_x2;
6429             break;
6430          case nir_intrinsic_global_atomic_exchange:
6431             op32 = aco_opcode::buffer_atomic_swap;
6432             op64 = aco_opcode::buffer_atomic_swap_x2;
6433             break;
6434          case nir_intrinsic_global_atomic_comp_swap:
6435             op32 = aco_opcode::buffer_atomic_cmpswap;
6436             op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6437             break;
6438          default:
6439             unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
6440       }
6441
6442       Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6443
6444       aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6445
6446       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6447       mubuf->operands[0] = Operand(rsrc);
6448       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6449       mubuf->operands[2] = Operand(0u);
6450       mubuf->operands[3] = Operand(data);
6451       if (return_previous)
6452          mubuf->definitions[0] = Definition(dst);
6453       mubuf->glc = return_previous;
6454       mubuf->dlc = false;
6455       mubuf->offset = 0;
6456       mubuf->addr64 = addr.type() == RegType::vgpr;
6457       mubuf->disable_wqm = true;
6458       mubuf->barrier = barrier_buffer;
6459       ctx->program->needs_exact = true;
6460       ctx->block->instructions.emplace_back(std::move(mubuf));
6461    }
6462 }
6463
6464 void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
6465    Builder bld(ctx->program, ctx->block);
6466    switch(instr->intrinsic) {
6467       case nir_intrinsic_group_memory_barrier:
6468       case nir_intrinsic_memory_barrier:
6469          bld.barrier(aco_opcode::p_memory_barrier_common);
6470          break;
6471       case nir_intrinsic_memory_barrier_buffer:
6472          bld.barrier(aco_opcode::p_memory_barrier_buffer);
6473          break;
6474       case nir_intrinsic_memory_barrier_image:
6475          bld.barrier(aco_opcode::p_memory_barrier_image);
6476          break;
6477       case nir_intrinsic_memory_barrier_tcs_patch:
6478       case nir_intrinsic_memory_barrier_shared:
6479          bld.barrier(aco_opcode::p_memory_barrier_shared);
6480          break;
6481       default:
6482          unreachable("Unimplemented memory barrier intrinsic");
6483          break;
6484    }
6485 }
6486
6487 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
6488 {
6489    // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
6490    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6491    assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
6492    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6493    Builder bld(ctx->program, ctx->block);
6494
6495    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
6496    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
6497    load_lds(ctx, elem_size_bytes, dst, address, nir_intrinsic_base(instr), align);
6498 }
6499
6500 void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
6501 {
6502    unsigned writemask = nir_intrinsic_write_mask(instr);
6503    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6504    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6505    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6506    assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
6507
6508    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
6509    store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
6510 }
6511
6512 void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
6513 {
6514    unsigned offset = nir_intrinsic_base(instr);
6515    Operand m = load_lds_size_m0(ctx);
6516    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6517    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6518
6519    unsigned num_operands = 3;
6520    aco_opcode op32, op64, op32_rtn, op64_rtn;
6521    switch(instr->intrinsic) {
6522       case nir_intrinsic_shared_atomic_add:
6523          op32 = aco_opcode::ds_add_u32;
6524          op64 = aco_opcode::ds_add_u64;
6525          op32_rtn = aco_opcode::ds_add_rtn_u32;
6526          op64_rtn = aco_opcode::ds_add_rtn_u64;
6527          break;
6528       case nir_intrinsic_shared_atomic_imin:
6529          op32 = aco_opcode::ds_min_i32;
6530          op64 = aco_opcode::ds_min_i64;
6531          op32_rtn = aco_opcode::ds_min_rtn_i32;
6532          op64_rtn = aco_opcode::ds_min_rtn_i64;
6533          break;
6534       case nir_intrinsic_shared_atomic_umin:
6535          op32 = aco_opcode::ds_min_u32;
6536          op64 = aco_opcode::ds_min_u64;
6537          op32_rtn = aco_opcode::ds_min_rtn_u32;
6538          op64_rtn = aco_opcode::ds_min_rtn_u64;
6539          break;
6540       case nir_intrinsic_shared_atomic_imax:
6541          op32 = aco_opcode::ds_max_i32;
6542          op64 = aco_opcode::ds_max_i64;
6543          op32_rtn = aco_opcode::ds_max_rtn_i32;
6544          op64_rtn = aco_opcode::ds_max_rtn_i64;
6545          break;
6546       case nir_intrinsic_shared_atomic_umax:
6547          op32 = aco_opcode::ds_max_u32;
6548          op64 = aco_opcode::ds_max_u64;
6549          op32_rtn = aco_opcode::ds_max_rtn_u32;
6550          op64_rtn = aco_opcode::ds_max_rtn_u64;
6551          break;
6552       case nir_intrinsic_shared_atomic_and:
6553          op32 = aco_opcode::ds_and_b32;
6554          op64 = aco_opcode::ds_and_b64;
6555          op32_rtn = aco_opcode::ds_and_rtn_b32;
6556          op64_rtn = aco_opcode::ds_and_rtn_b64;
6557          break;
6558       case nir_intrinsic_shared_atomic_or:
6559          op32 = aco_opcode::ds_or_b32;
6560          op64 = aco_opcode::ds_or_b64;
6561          op32_rtn = aco_opcode::ds_or_rtn_b32;
6562          op64_rtn = aco_opcode::ds_or_rtn_b64;
6563          break;
6564       case nir_intrinsic_shared_atomic_xor:
6565          op32 = aco_opcode::ds_xor_b32;
6566          op64 = aco_opcode::ds_xor_b64;
6567          op32_rtn = aco_opcode::ds_xor_rtn_b32;
6568          op64_rtn = aco_opcode::ds_xor_rtn_b64;
6569          break;
6570       case nir_intrinsic_shared_atomic_exchange:
6571          op32 = aco_opcode::ds_write_b32;
6572          op64 = aco_opcode::ds_write_b64;
6573          op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
6574          op64_rtn = aco_opcode::ds_wrxchg2_rtn_b64;
6575          break;
6576       case nir_intrinsic_shared_atomic_comp_swap:
6577          op32 = aco_opcode::ds_cmpst_b32;
6578          op64 = aco_opcode::ds_cmpst_b64;
6579          op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
6580          op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
6581          num_operands = 4;
6582          break;
6583       default:
6584          unreachable("Unhandled shared atomic intrinsic");
6585    }
6586
6587    /* return the previous value if dest is ever used */
6588    bool return_previous = false;
6589    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
6590       return_previous = true;
6591       break;
6592    }
6593    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
6594       return_previous = true;
6595       break;
6596    }
6597
6598    aco_opcode op;
6599    if (data.size() == 1) {
6600       assert(instr->dest.ssa.bit_size == 32);
6601       op = return_previous ? op32_rtn : op32;
6602    } else {
6603       assert(instr->dest.ssa.bit_size == 64);
6604       op = return_previous ? op64_rtn : op64;
6605    }
6606
6607    if (offset > 65535) {
6608       Builder bld(ctx->program, ctx->block);
6609       address = bld.vadd32(bld.def(v1), Operand(offset), address);
6610       offset = 0;
6611    }
6612
6613    aco_ptr<DS_instruction> ds;
6614    ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
6615    ds->operands[0] = Operand(address);
6616    ds->operands[1] = Operand(data);
6617    if (num_operands == 4)
6618       ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
6619    ds->operands[num_operands - 1] = m;
6620    ds->offset0 = offset;
6621    if (return_previous)
6622       ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
6623    ctx->block->instructions.emplace_back(std::move(ds));
6624 }
6625
6626 Temp get_scratch_resource(isel_context *ctx)
6627 {
6628    Builder bld(ctx->program, ctx->block);
6629    Temp scratch_addr = ctx->program->private_segment_buffer;
6630    if (ctx->stage != compute_cs)
6631       scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
6632
6633    uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
6634                         S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);;
6635
6636    if (ctx->program->chip_class >= GFX10) {
6637       rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
6638                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
6639                    S_008F0C_RESOURCE_LEVEL(1);
6640    } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
6641       rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6642                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
6643    }
6644
6645    /* older generations need element size = 16 bytes. element size removed in GFX9 */
6646    if (ctx->program->chip_class <= GFX8)
6647       rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
6648
6649    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
6650 }
6651
6652 void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
6653    assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64);
6654    Builder bld(ctx->program, ctx->block);
6655    Temp rsrc = get_scratch_resource(ctx);
6656    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6657    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6658
6659    aco_opcode op;
6660    switch (dst.size()) {
6661       case 1:
6662          op = aco_opcode::buffer_load_dword;
6663          break;
6664       case 2:
6665          op = aco_opcode::buffer_load_dwordx2;
6666          break;
6667       case 3:
6668          op = aco_opcode::buffer_load_dwordx3;
6669          break;
6670       case 4:
6671          op = aco_opcode::buffer_load_dwordx4;
6672          break;
6673       case 6:
6674       case 8: {
6675          std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
6676          Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
6677                                 bld.def(v4), rsrc, offset,
6678                                 ctx->program->scratch_offset, 0, true);
6679          Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
6680                                                   aco_opcode::buffer_load_dwordx4,
6681                                 dst.size() == 6 ? bld.def(v2) : bld.def(v4),
6682                                 rsrc, offset, ctx->program->scratch_offset, 16, true);
6683          emit_split_vector(ctx, lower, 2);
6684          elems[0] = emit_extract_vector(ctx, lower, 0, v2);
6685          elems[1] = emit_extract_vector(ctx, lower, 1, v2);
6686          if (dst.size() == 8) {
6687             emit_split_vector(ctx, upper, 2);
6688             elems[2] = emit_extract_vector(ctx, upper, 0, v2);
6689             elems[3] = emit_extract_vector(ctx, upper, 1, v2);
6690          } else {
6691             elems[2] = upper;
6692          }
6693
6694          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
6695                                                                          Format::PSEUDO, dst.size() / 2, 1)};
6696          for (unsigned i = 0; i < dst.size() / 2; i++)
6697             vec->operands[i] = Operand(elems[i]);
6698          vec->definitions[0] = Definition(dst);
6699          bld.insert(std::move(vec));
6700          ctx->allocated_vec.emplace(dst.id(), elems);
6701          return;
6702       }
6703       default:
6704          unreachable("Wrong dst size for nir_intrinsic_load_scratch");
6705    }
6706
6707    bld.mubuf(op, Definition(dst), rsrc, offset, ctx->program->scratch_offset, 0, true);
6708    emit_split_vector(ctx, dst, instr->num_components);
6709 }
6710
6711 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
6712    assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
6713    Builder bld(ctx->program, ctx->block);
6714    Temp rsrc = get_scratch_resource(ctx);
6715    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6716    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6717
6718    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6719    unsigned writemask = nir_intrinsic_write_mask(instr);
6720
6721    while (writemask) {
6722       int start, count;
6723       u_bit_scan_consecutive_range(&writemask, &start, &count);
6724       int num_bytes = count * elem_size_bytes;
6725
6726       if (num_bytes > 16) {
6727          assert(elem_size_bytes == 8);
6728          writemask |= (((count - 2) << 1) - 1) << (start + 2);
6729          count = 2;
6730          num_bytes = 16;
6731       }
6732
6733       // TODO: check alignment of sub-dword stores
6734       // TODO: split 3 bytes. there is no store instruction for that
6735
6736       Temp write_data;
6737       if (count != instr->num_components) {
6738          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
6739          for (int i = 0; i < count; i++) {
6740             Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
6741             vec->operands[i] = Operand(elem);
6742          }
6743          write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
6744          vec->definitions[0] = Definition(write_data);
6745          ctx->block->instructions.emplace_back(std::move(vec));
6746       } else {
6747          write_data = data;
6748       }
6749
6750       aco_opcode op;
6751       switch (num_bytes) {
6752          case 4:
6753             op = aco_opcode::buffer_store_dword;
6754             break;
6755          case 8:
6756             op = aco_opcode::buffer_store_dwordx2;
6757             break;
6758          case 12:
6759             op = aco_opcode::buffer_store_dwordx3;
6760             break;
6761          case 16:
6762             op = aco_opcode::buffer_store_dwordx4;
6763             break;
6764          default:
6765             unreachable("Invalid data size for nir_intrinsic_store_scratch.");
6766       }
6767
6768       bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true);
6769    }
6770 }
6771
6772 void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
6773    uint8_t log2_ps_iter_samples;
6774    if (ctx->program->info->ps.force_persample) {
6775       log2_ps_iter_samples =
6776          util_logbase2(ctx->options->key.fs.num_samples);
6777    } else {
6778       log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
6779    }
6780
6781    /* The bit pattern matches that used by fixed function fragment
6782     * processing. */
6783    static const unsigned ps_iter_masks[] = {
6784       0xffff, /* not used */
6785       0x5555,
6786       0x1111,
6787       0x0101,
6788       0x0001,
6789    };
6790    assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
6791
6792    Builder bld(ctx->program, ctx->block);
6793
6794    Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
6795                              get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
6796    Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
6797    Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
6798    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6799    bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
6800 }
6801
6802 void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr) {
6803    Builder bld(ctx->program, ctx->block);
6804
6805    unsigned stream = nir_intrinsic_stream_id(instr);
6806    Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6807    next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
6808    nir_const_value *next_vertex_cv = nir_src_as_const_value(instr->src[0]);
6809
6810    /* get GSVS ring */
6811    Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_GSVS_GS * 16u));
6812
6813    unsigned num_components =
6814       ctx->program->info->gs.num_stream_output_components[stream];
6815    assert(num_components);
6816
6817    unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
6818    unsigned stream_offset = 0;
6819    for (unsigned i = 0; i < stream; i++) {
6820       unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * ctx->shader->info.gs.vertices_out;
6821       stream_offset += prev_stride * ctx->program->wave_size;
6822    }
6823
6824    /* Limit on the stride field for <= GFX7. */
6825    assert(stride < (1 << 14));
6826
6827    Temp gsvs_dwords[4];
6828    for (unsigned i = 0; i < 4; i++)
6829       gsvs_dwords[i] = bld.tmp(s1);
6830    bld.pseudo(aco_opcode::p_split_vector,
6831               Definition(gsvs_dwords[0]),
6832               Definition(gsvs_dwords[1]),
6833               Definition(gsvs_dwords[2]),
6834               Definition(gsvs_dwords[3]),
6835               gsvs_ring);
6836
6837    if (stream_offset) {
6838       Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand(stream_offset));
6839
6840       Temp carry = bld.tmp(s1);
6841       gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), gsvs_dwords[0], stream_offset_tmp);
6842       gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(0u), bld.scc(carry));
6843    }
6844
6845    gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(S_008F04_STRIDE(stride)));
6846    gsvs_dwords[2] = bld.copy(bld.def(s1), Operand((uint32_t)ctx->program->wave_size));
6847
6848    gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
6849                           gsvs_dwords[0], gsvs_dwords[1], gsvs_dwords[2], gsvs_dwords[3]);
6850
6851    unsigned offset = 0;
6852    for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
6853       if (ctx->program->info->gs.output_streams[i] != stream)
6854          continue;
6855
6856       for (unsigned j = 0; j < 4; j++) {
6857          if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))
6858             continue;
6859
6860          if (ctx->outputs.mask[i] & (1 << j)) {
6861             Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
6862             unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
6863             if (const_offset >= 4096u) {
6864                if (vaddr_offset.isUndefined())
6865                   vaddr_offset = bld.copy(bld.def(v1), Operand(const_offset / 4096u * 4096u));
6866                else
6867                   vaddr_offset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), vaddr_offset);
6868                const_offset %= 4096u;
6869             }
6870
6871             aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
6872             mtbuf->operands[0] = Operand(gsvs_ring);
6873             mtbuf->operands[1] = vaddr_offset;
6874             mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->gs2vs_offset));
6875             mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);
6876             mtbuf->offen = !vaddr_offset.isUndefined();
6877             mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
6878             mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
6879             mtbuf->offset = const_offset;
6880             mtbuf->glc = true;
6881             mtbuf->slc = true;
6882             mtbuf->barrier = barrier_gs_data;
6883             mtbuf->can_reorder = true;
6884             bld.insert(std::move(mtbuf));
6885          }
6886
6887          offset += ctx->shader->info.gs.vertices_out;
6888       }
6889
6890       /* outputs for the next vertex are undefined and keeping them around can
6891        * create invalid IR with control flow */
6892       ctx->outputs.mask[i] = 0;
6893    }
6894
6895    bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
6896 }
6897
6898 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
6899 {
6900    Builder bld(ctx->program, ctx->block);
6901
6902    if (cluster_size == 1) {
6903       return src;
6904    } if (op == nir_op_iand && cluster_size == 4) {
6905       //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
6906       Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
6907       return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
6908                       bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
6909    } else if (op == nir_op_ior && cluster_size == 4) {
6910       //subgroupClusteredOr(val, 4) -> wqm(val & exec)
6911       return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
6912                       bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
6913    } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
6914       //subgroupAnd(val) -> (exec & ~val) == 0
6915       Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
6916       Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
6917       return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
6918    } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
6919       //subgroupOr(val) -> (val & exec) != 0
6920       Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp();
6921       return bool_to_vector_condition(ctx, tmp);
6922    } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
6923       //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
6924       Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
6925       tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
6926       tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
6927       return bool_to_vector_condition(ctx, tmp);
6928    } else {
6929       //subgroupClustered{And,Or,Xor}(val, n) ->
6930       //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) ;  just v_mbcnt_lo_u32_b32 on wave32
6931       //cluster_offset = ~(n - 1) & lane_id
6932       //cluster_mask = ((1 << n) - 1)
6933       //subgroupClusteredAnd():
6934       //   return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
6935       //subgroupClusteredOr():
6936       //   return ((val & exec) >> cluster_offset) & cluster_mask != 0
6937       //subgroupClusteredXor():
6938       //   return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
6939       Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
6940       Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
6941
6942       Temp tmp;
6943       if (op == nir_op_iand)
6944          tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
6945       else
6946          tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
6947
6948       uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
6949
6950       if (ctx->program->chip_class <= GFX7)
6951          tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
6952       else if (ctx->program->wave_size == 64)
6953          tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
6954       else
6955          tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
6956       tmp = emit_extract_vector(ctx, tmp, 0, v1);
6957       if (cluster_mask != 0xffffffff)
6958          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
6959
6960       Definition cmp_def = Definition();
6961       if (op == nir_op_iand) {
6962          cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(cluster_mask), tmp).def(0);
6963       } else if (op == nir_op_ior) {
6964          cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
6965       } else if (op == nir_op_ixor) {
6966          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
6967                         bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
6968          cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
6969       }
6970       cmp_def.setHint(vcc);
6971       return cmp_def.getTemp();
6972    }
6973 }
6974
6975 Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
6976 {
6977    Builder bld(ctx->program, ctx->block);
6978
6979    //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
6980    //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
6981    //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
6982    Temp tmp;
6983    if (op == nir_op_iand)
6984       tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
6985    else
6986       tmp = bld.sop2(Builder::s_and, bld.def(s2), bld.def(s1, scc), src, Operand(exec, bld.lm));
6987
6988    Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
6989    Temp lo = lohi.def(0).getTemp();
6990    Temp hi = lohi.def(1).getTemp();
6991    Temp mbcnt = emit_mbcnt(ctx, bld.def(v1), Operand(lo), Operand(hi));
6992
6993    Definition cmp_def = Definition();
6994    if (op == nir_op_iand)
6995       cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
6996    else if (op == nir_op_ior)
6997       cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
6998    else if (op == nir_op_ixor)
6999       cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u),
7000                          bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
7001    cmp_def.setHint(vcc);
7002    return cmp_def.getTemp();
7003 }
7004
7005 Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
7006 {
7007    Builder bld(ctx->program, ctx->block);
7008
7009    //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
7010    //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
7011    //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
7012    Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
7013    if (op == nir_op_iand)
7014       return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7015    else if (op == nir_op_ior)
7016       return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7017    else if (op == nir_op_ixor)
7018       return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7019
7020    assert(false);
7021    return Temp();
7022 }
7023
7024 void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
7025 {
7026    Builder bld(ctx->program, ctx->block);
7027    Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7028    if (src.regClass().type() == RegType::vgpr) {
7029       bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7030    } else if (src.regClass() == s1) {
7031       bld.sop1(aco_opcode::s_mov_b32, dst, src);
7032    } else if (src.regClass() == s2) {
7033       bld.sop1(aco_opcode::s_mov_b64, dst, src);
7034    } else {
7035       fprintf(stderr, "Unimplemented NIR instr bit size: ");
7036       nir_print_instr(&instr->instr, stderr);
7037       fprintf(stderr, "\n");
7038    }
7039 }
7040
7041 void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
7042 {
7043    Builder bld(ctx->program, ctx->block);
7044    Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center);
7045    Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1);
7046    Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1);
7047
7048    Temp ddx_1, ddx_2, ddy_1, ddy_2;
7049    uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
7050    uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
7051    uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
7052
7053    /* Build DD X/Y */
7054    if (ctx->program->chip_class >= GFX8) {
7055       Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
7056       ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
7057       ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
7058       Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
7059       ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
7060       ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
7061    } else {
7062       Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
7063       ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
7064       ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
7065       ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
7066       ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_1);
7067       Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
7068       ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
7069       ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_2);
7070       ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
7071       ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
7072    }
7073
7074    /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
7075    Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
7076    Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
7077    tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
7078    tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
7079    Temp wqm1 = bld.tmp(v1);
7080    emit_wqm(ctx, tmp1, wqm1, true);
7081    Temp wqm2 = bld.tmp(v1);
7082    emit_wqm(ctx, tmp2, wqm2, true);
7083    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
7084    return;
7085 }
7086
7087 void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
7088 {
7089    Builder bld(ctx->program, ctx->block);
7090    switch(instr->intrinsic) {
7091    case nir_intrinsic_load_barycentric_sample:
7092    case nir_intrinsic_load_barycentric_pixel:
7093    case nir_intrinsic_load_barycentric_centroid: {
7094       glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
7095       Temp bary = Temp(0, s2);
7096       switch (mode) {
7097       case INTERP_MODE_SMOOTH:
7098       case INTERP_MODE_NONE:
7099          if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
7100             bary = get_arg(ctx, ctx->args->ac.persp_center);
7101          else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
7102             bary = ctx->persp_centroid;
7103          else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
7104             bary = get_arg(ctx, ctx->args->ac.persp_sample);
7105          break;
7106       case INTERP_MODE_NOPERSPECTIVE:
7107          if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
7108             bary = get_arg(ctx, ctx->args->ac.linear_center);
7109          else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
7110             bary = ctx->linear_centroid;
7111          else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
7112             bary = get_arg(ctx, ctx->args->ac.linear_sample);
7113          break;
7114       default:
7115          break;
7116       }
7117       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7118       Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7119       Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7120       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
7121                  Operand(p1), Operand(p2));
7122       emit_split_vector(ctx, dst, 2);
7123       break;
7124    }
7125    case nir_intrinsic_load_barycentric_model: {
7126       Temp model = get_arg(ctx, ctx->args->ac.pull_model);
7127
7128       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7129       Temp p1 = emit_extract_vector(ctx, model, 0, v1);
7130       Temp p2 = emit_extract_vector(ctx, model, 1, v1);
7131       Temp p3 = emit_extract_vector(ctx, model, 2, v1);
7132       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
7133                  Operand(p1), Operand(p2), Operand(p3));
7134       emit_split_vector(ctx, dst, 3);
7135       break;
7136    }
7137    case nir_intrinsic_load_barycentric_at_sample: {
7138       uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
7139       switch (ctx->options->key.fs.num_samples) {
7140          case 2: sample_pos_offset += 1 << 3; break;
7141          case 4: sample_pos_offset += 3 << 3; break;
7142          case 8: sample_pos_offset += 7 << 3; break;
7143          default: break;
7144       }
7145       Temp sample_pos;
7146       Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
7147       nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
7148       Temp private_segment_buffer = ctx->program->private_segment_buffer;
7149       if (addr.type() == RegType::sgpr) {
7150          Operand offset;
7151          if (const_addr) {
7152             sample_pos_offset += const_addr->u32 << 3;
7153             offset = Operand(sample_pos_offset);
7154          } else if (ctx->options->chip_class >= GFX9) {
7155             offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
7156          } else {
7157             offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
7158             offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
7159          }
7160
7161          Operand off = bld.copy(bld.def(s1), Operand(offset));
7162          sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off);
7163
7164       } else if (ctx->options->chip_class >= GFX9) {
7165          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
7166          sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, private_segment_buffer, sample_pos_offset);
7167       } else if (ctx->options->chip_class >= GFX7) {
7168          /* addr += private_segment_buffer + sample_pos_offset */
7169          Temp tmp0 = bld.tmp(s1);
7170          Temp tmp1 = bld.tmp(s1);
7171          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), private_segment_buffer);
7172          Definition scc_tmp = bld.def(s1, scc);
7173          tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
7174          tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), bld.scc(scc_tmp.getTemp()));
7175          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
7176          Temp pck0 = bld.tmp(v1);
7177          Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
7178          tmp1 = as_vgpr(ctx, tmp1);
7179          Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry);
7180          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
7181
7182          /* sample_pos = flat_load_dwordx2 addr */
7183          sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
7184       } else {
7185          assert(ctx->options->chip_class == GFX6);
7186
7187          uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
7188                               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
7189          Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, Operand(0u), Operand(rsrc_conf));
7190
7191          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
7192          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand(0u));
7193
7194          sample_pos = bld.tmp(v2);
7195
7196          aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)};
7197          load->definitions[0] = Definition(sample_pos);
7198          load->operands[0] = Operand(rsrc);
7199          load->operands[1] = Operand(addr);
7200          load->operands[2] = Operand(0u);
7201          load->offset = sample_pos_offset;
7202          load->offen = 0;
7203          load->addr64 = true;
7204          load->glc = false;
7205          load->dlc = false;
7206          load->disable_wqm = false;
7207          load->barrier = barrier_none;
7208          load->can_reorder = true;
7209          ctx->block->instructions.emplace_back(std::move(load));
7210       }
7211
7212       /* sample_pos -= 0.5 */
7213       Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
7214       Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
7215       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
7216       pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
7217       pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
7218
7219       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
7220       break;
7221    }
7222    case nir_intrinsic_load_barycentric_at_offset: {
7223       Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
7224       RegClass rc = RegClass(offset.type(), 1);
7225       Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
7226       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
7227       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
7228       break;
7229    }
7230    case nir_intrinsic_load_front_face: {
7231       bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7232                Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc);
7233       break;
7234    }
7235    case nir_intrinsic_load_view_index: {
7236       if (ctx->stage & (sw_vs | sw_gs | sw_tcs | sw_tes)) {
7237          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7238          bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
7239          break;
7240       }
7241
7242       /* fallthrough */
7243    }
7244    case nir_intrinsic_load_layer_id: {
7245       unsigned idx = nir_intrinsic_base(instr);
7246       bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7247                  Operand(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0);
7248       break;
7249    }
7250    case nir_intrinsic_load_frag_coord: {
7251       emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
7252       break;
7253    }
7254    case nir_intrinsic_load_sample_pos: {
7255       Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
7256       Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
7257       bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7258                  posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
7259                  posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
7260       break;
7261    }
7262    case nir_intrinsic_load_tess_coord:
7263       visit_load_tess_coord(ctx, instr);
7264       break;
7265    case nir_intrinsic_load_interpolated_input:
7266       visit_load_interpolated_input(ctx, instr);
7267       break;
7268    case nir_intrinsic_store_output:
7269       visit_store_output(ctx, instr);
7270       break;
7271    case nir_intrinsic_load_input:
7272    case nir_intrinsic_load_input_vertex:
7273       visit_load_input(ctx, instr);
7274       break;
7275    case nir_intrinsic_load_output:
7276       visit_load_output(ctx, instr);
7277       break;
7278    case nir_intrinsic_load_per_vertex_input:
7279       visit_load_per_vertex_input(ctx, instr);
7280       break;
7281    case nir_intrinsic_load_per_vertex_output:
7282       visit_load_per_vertex_output(ctx, instr);
7283       break;
7284    case nir_intrinsic_store_per_vertex_output:
7285       visit_store_per_vertex_output(ctx, instr);
7286       break;
7287    case nir_intrinsic_load_ubo:
7288       visit_load_ubo(ctx, instr);
7289       break;
7290    case nir_intrinsic_load_push_constant:
7291       visit_load_push_constant(ctx, instr);
7292       break;
7293    case nir_intrinsic_load_constant:
7294       visit_load_constant(ctx, instr);
7295       break;
7296    case nir_intrinsic_vulkan_resource_index:
7297       visit_load_resource(ctx, instr);
7298       break;
7299    case nir_intrinsic_discard:
7300       visit_discard(ctx, instr);
7301       break;
7302    case nir_intrinsic_discard_if:
7303       visit_discard_if(ctx, instr);
7304       break;
7305    case nir_intrinsic_load_shared:
7306       visit_load_shared(ctx, instr);
7307       break;
7308    case nir_intrinsic_store_shared:
7309       visit_store_shared(ctx, instr);
7310       break;
7311    case nir_intrinsic_shared_atomic_add:
7312    case nir_intrinsic_shared_atomic_imin:
7313    case nir_intrinsic_shared_atomic_umin:
7314    case nir_intrinsic_shared_atomic_imax:
7315    case nir_intrinsic_shared_atomic_umax:
7316    case nir_intrinsic_shared_atomic_and:
7317    case nir_intrinsic_shared_atomic_or:
7318    case nir_intrinsic_shared_atomic_xor:
7319    case nir_intrinsic_shared_atomic_exchange:
7320    case nir_intrinsic_shared_atomic_comp_swap:
7321       visit_shared_atomic(ctx, instr);
7322       break;
7323    case nir_intrinsic_image_deref_load:
7324       visit_image_load(ctx, instr);
7325       break;
7326    case nir_intrinsic_image_deref_store:
7327       visit_image_store(ctx, instr);
7328       break;
7329    case nir_intrinsic_image_deref_atomic_add:
7330    case nir_intrinsic_image_deref_atomic_umin:
7331    case nir_intrinsic_image_deref_atomic_imin:
7332    case nir_intrinsic_image_deref_atomic_umax:
7333    case nir_intrinsic_image_deref_atomic_imax:
7334    case nir_intrinsic_image_deref_atomic_and:
7335    case nir_intrinsic_image_deref_atomic_or:
7336    case nir_intrinsic_image_deref_atomic_xor:
7337    case nir_intrinsic_image_deref_atomic_exchange:
7338    case nir_intrinsic_image_deref_atomic_comp_swap:
7339       visit_image_atomic(ctx, instr);
7340       break;
7341    case nir_intrinsic_image_deref_size:
7342       visit_image_size(ctx, instr);
7343       break;
7344    case nir_intrinsic_load_ssbo:
7345       visit_load_ssbo(ctx, instr);
7346       break;
7347    case nir_intrinsic_store_ssbo:
7348       visit_store_ssbo(ctx, instr);
7349       break;
7350    case nir_intrinsic_load_global:
7351       visit_load_global(ctx, instr);
7352       break;
7353    case nir_intrinsic_store_global:
7354       visit_store_global(ctx, instr);
7355       break;
7356    case nir_intrinsic_global_atomic_add:
7357    case nir_intrinsic_global_atomic_imin:
7358    case nir_intrinsic_global_atomic_umin:
7359    case nir_intrinsic_global_atomic_imax:
7360    case nir_intrinsic_global_atomic_umax:
7361    case nir_intrinsic_global_atomic_and:
7362    case nir_intrinsic_global_atomic_or:
7363    case nir_intrinsic_global_atomic_xor:
7364    case nir_intrinsic_global_atomic_exchange:
7365    case nir_intrinsic_global_atomic_comp_swap:
7366       visit_global_atomic(ctx, instr);
7367       break;
7368    case nir_intrinsic_ssbo_atomic_add:
7369    case nir_intrinsic_ssbo_atomic_imin:
7370    case nir_intrinsic_ssbo_atomic_umin:
7371    case nir_intrinsic_ssbo_atomic_imax:
7372    case nir_intrinsic_ssbo_atomic_umax:
7373    case nir_intrinsic_ssbo_atomic_and:
7374    case nir_intrinsic_ssbo_atomic_or:
7375    case nir_intrinsic_ssbo_atomic_xor:
7376    case nir_intrinsic_ssbo_atomic_exchange:
7377    case nir_intrinsic_ssbo_atomic_comp_swap:
7378       visit_atomic_ssbo(ctx, instr);
7379       break;
7380    case nir_intrinsic_load_scratch:
7381       visit_load_scratch(ctx, instr);
7382       break;
7383    case nir_intrinsic_store_scratch:
7384       visit_store_scratch(ctx, instr);
7385       break;
7386    case nir_intrinsic_get_buffer_size:
7387       visit_get_buffer_size(ctx, instr);
7388       break;
7389    case nir_intrinsic_control_barrier: {
7390       if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
7391          /* GFX6 only (thanks to a hw bug workaround):
7392           * The real barrier instruction isn’t needed, because an entire patch
7393           * always fits into a single wave.
7394           */
7395          break;
7396       }
7397
7398       if (ctx->program->workgroup_size > ctx->program->wave_size)
7399          bld.sopp(aco_opcode::s_barrier);
7400
7401       break;
7402    }
7403    case nir_intrinsic_memory_barrier_tcs_patch:
7404    case nir_intrinsic_group_memory_barrier:
7405    case nir_intrinsic_memory_barrier:
7406    case nir_intrinsic_memory_barrier_buffer:
7407    case nir_intrinsic_memory_barrier_image:
7408    case nir_intrinsic_memory_barrier_shared:
7409       emit_memory_barrier(ctx, instr);
7410       break;
7411    case nir_intrinsic_load_num_work_groups: {
7412       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7413       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
7414       emit_split_vector(ctx, dst, 3);
7415       break;
7416    }
7417    case nir_intrinsic_load_local_invocation_id: {
7418       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7419       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
7420       emit_split_vector(ctx, dst, 3);
7421       break;
7422    }
7423    case nir_intrinsic_load_work_group_id: {
7424       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7425       struct ac_arg *args = ctx->args->ac.workgroup_ids;
7426       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
7427                  args[0].used ? Operand(get_arg(ctx, args[0])) : Operand(0u),
7428                  args[1].used ? Operand(get_arg(ctx, args[1])) : Operand(0u),
7429                  args[2].used ? Operand(get_arg(ctx, args[2])) : Operand(0u));
7430       emit_split_vector(ctx, dst, 3);
7431       break;
7432    }
7433    case nir_intrinsic_load_local_invocation_index: {
7434       Temp id = emit_mbcnt(ctx, bld.def(v1));
7435
7436       /* The tg_size bits [6:11] contain the subgroup id,
7437        * we need this multiplied by the wave size, and then OR the thread id to it.
7438        */
7439       if (ctx->program->wave_size == 64) {
7440          /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just feed that to v_or */
7441          Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u),
7442                                 get_arg(ctx, ctx->args->ac.tg_size));
7443          bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
7444       } else {
7445          /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR  */
7446          Temp tg_num = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
7447                                 get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
7448          bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, Operand(0x5u), id);
7449       }
7450       break;
7451    }
7452    case nir_intrinsic_load_subgroup_id: {
7453       if (ctx->stage == compute_cs) {
7454          bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc),
7455                   get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
7456       } else {
7457          bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
7458       }
7459       break;
7460    }
7461    case nir_intrinsic_load_subgroup_invocation: {
7462       emit_mbcnt(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)));
7463       break;
7464    }
7465    case nir_intrinsic_load_num_subgroups: {
7466       if (ctx->stage == compute_cs)
7467          bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu),
7468                   get_arg(ctx, ctx->args->ac.tg_size));
7469       else
7470          bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
7471       break;
7472    }
7473    case nir_intrinsic_ballot: {
7474       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7475       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7476       Definition tmp = bld.def(dst.regClass());
7477       Definition lanemask_tmp = dst.size() == bld.lm.size() ? tmp : bld.def(src.regClass());
7478       if (instr->src[0].ssa->bit_size == 1) {
7479          assert(src.regClass() == bld.lm);
7480          bld.sop2(Builder::s_and, lanemask_tmp, bld.def(s1, scc), Operand(exec, bld.lm), src);
7481       } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
7482          bld.vopc(aco_opcode::v_cmp_lg_u32, lanemask_tmp, Operand(0u), src);
7483       } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
7484          bld.vopc(aco_opcode::v_cmp_lg_u64, lanemask_tmp, Operand(0u), src);
7485       } else {
7486          fprintf(stderr, "Unimplemented NIR instr bit size: ");
7487          nir_print_instr(&instr->instr, stderr);
7488          fprintf(stderr, "\n");
7489       }
7490       if (dst.size() != bld.lm.size()) {
7491          /* Wave32 with ballot size set to 64 */
7492          bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lanemask_tmp.getTemp(), Operand(0u));
7493       }
7494       emit_wqm(ctx, tmp.getTemp(), dst);
7495       break;
7496    }
7497    case nir_intrinsic_shuffle:
7498    case nir_intrinsic_read_invocation: {
7499       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7500       if (!ctx->divergent_vals[instr->src[0].ssa->index]) {
7501          emit_uniform_subgroup(ctx, instr, src);
7502       } else {
7503          Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
7504          if (instr->intrinsic == nir_intrinsic_read_invocation || !ctx->divergent_vals[instr->src[1].ssa->index])
7505             tid = bld.as_uniform(tid);
7506          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7507          if (src.regClass() == v1) {
7508             emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst);
7509          } else if (src.regClass() == v2) {
7510             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7511             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7512             lo = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, lo));
7513             hi = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, hi));
7514             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7515             emit_split_vector(ctx, dst, 2);
7516          } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
7517             assert(src.regClass() == bld.lm);
7518             Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
7519             bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
7520          } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
7521             assert(src.regClass() == bld.lm);
7522             Temp tmp;
7523             if (ctx->program->chip_class <= GFX7)
7524                tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
7525             else if (ctx->program->wave_size == 64)
7526                tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
7527             else
7528                tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
7529             tmp = emit_extract_vector(ctx, tmp, 0, v1);
7530             tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
7531             emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst);
7532          } else {
7533             fprintf(stderr, "Unimplemented NIR instr bit size: ");
7534             nir_print_instr(&instr->instr, stderr);
7535             fprintf(stderr, "\n");
7536          }
7537       }
7538       break;
7539    }
7540    case nir_intrinsic_load_sample_id: {
7541       bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7542                get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
7543       break;
7544    }
7545    case nir_intrinsic_load_sample_mask_in: {
7546       visit_load_sample_mask_in(ctx, instr);
7547       break;
7548    }
7549    case nir_intrinsic_read_first_invocation: {
7550       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7551       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7552       if (src.regClass() == v1) {
7553          emit_wqm(ctx,
7554                   bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
7555                   dst);
7556       } else if (src.regClass() == v2) {
7557          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7558          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7559          lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
7560          hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
7561          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7562          emit_split_vector(ctx, dst, 2);
7563       } else if (instr->dest.ssa.bit_size == 1) {
7564          assert(src.regClass() == bld.lm);
7565          Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
7566                              bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
7567          bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
7568       } else if (src.regClass() == s1) {
7569          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
7570       } else if (src.regClass() == s2) {
7571          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
7572       } else {
7573          fprintf(stderr, "Unimplemented NIR instr bit size: ");
7574          nir_print_instr(&instr->instr, stderr);
7575          fprintf(stderr, "\n");
7576       }
7577       break;
7578    }
7579    case nir_intrinsic_vote_all: {
7580       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7581       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7582       assert(src.regClass() == bld.lm);
7583       assert(dst.regClass() == bld.lm);
7584
7585       Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
7586       Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
7587       bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
7588       break;
7589    }
7590    case nir_intrinsic_vote_any: {
7591       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7592       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7593       assert(src.regClass() == bld.lm);
7594       assert(dst.regClass() == bld.lm);
7595
7596       Temp tmp = bool_to_scalar_condition(ctx, src);
7597       bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
7598       break;
7599    }
7600    case nir_intrinsic_reduce:
7601    case nir_intrinsic_inclusive_scan:
7602    case nir_intrinsic_exclusive_scan: {
7603       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7604       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7605       nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
7606       unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
7607          nir_intrinsic_cluster_size(instr) : 0;
7608       cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
7609
7610       if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
7611          emit_uniform_subgroup(ctx, instr, src);
7612       } else if (instr->dest.ssa.bit_size == 1) {
7613          if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
7614             op = nir_op_iand;
7615          else if (op == nir_op_iadd)
7616             op = nir_op_ixor;
7617          else if (op == nir_op_umax || op == nir_op_imax)
7618             op = nir_op_ior;
7619          assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
7620
7621          switch (instr->intrinsic) {
7622          case nir_intrinsic_reduce:
7623             emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
7624             break;
7625          case nir_intrinsic_exclusive_scan:
7626             emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
7627             break;
7628          case nir_intrinsic_inclusive_scan:
7629             emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
7630             break;
7631          default:
7632             assert(false);
7633          }
7634       } else if (cluster_size == 1) {
7635          bld.copy(Definition(dst), src);
7636       } else {
7637          src = as_vgpr(ctx, src);
7638
7639          ReduceOp reduce_op;
7640          switch (op) {
7641          #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
7642             CASE(iadd)
7643             CASE(imul)
7644             CASE(fadd)
7645             CASE(fmul)
7646             CASE(imin)
7647             CASE(umin)
7648             CASE(fmin)
7649             CASE(imax)
7650             CASE(umax)
7651             CASE(fmax)
7652             CASE(iand)
7653             CASE(ior)
7654             CASE(ixor)
7655             default:
7656                unreachable("unknown reduction op");
7657          #undef CASE
7658          }
7659
7660          aco_opcode aco_op;
7661          switch (instr->intrinsic) {
7662             case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
7663             case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
7664             case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
7665             default:
7666                unreachable("unknown reduce intrinsic");
7667          }
7668
7669          aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
7670          reduce->operands[0] = Operand(src);
7671          // filled in by aco_reduce_assign.cpp, used internally as part of the
7672          // reduce sequence
7673          assert(dst.size() == 1 || dst.size() == 2);
7674          reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
7675          reduce->operands[2] = Operand(v1.as_linear());
7676
7677          Temp tmp_dst = bld.tmp(dst.regClass());
7678          reduce->definitions[0] = Definition(tmp_dst);
7679          reduce->definitions[1] = bld.def(ctx->program->lane_mask); // used internally
7680          reduce->definitions[2] = Definition();
7681          reduce->definitions[3] = Definition(scc, s1);
7682          reduce->definitions[4] = Definition();
7683          reduce->reduce_op = reduce_op;
7684          reduce->cluster_size = cluster_size;
7685          ctx->block->instructions.emplace_back(std::move(reduce));
7686
7687          emit_wqm(ctx, tmp_dst, dst);
7688       }
7689       break;
7690    }
7691    case nir_intrinsic_quad_broadcast: {
7692       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7693       if (!ctx->divergent_vals[instr->dest.ssa.index]) {
7694          emit_uniform_subgroup(ctx, instr, src);
7695       } else {
7696          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7697          unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
7698          uint32_t dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
7699
7700          if (instr->dest.ssa.bit_size == 1) {
7701             assert(src.regClass() == bld.lm);
7702             assert(dst.regClass() == bld.lm);
7703             uint32_t half_mask = 0x11111111u << lane;
7704             Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
7705             Temp tmp = bld.tmp(bld.lm);
7706             bld.sop1(Builder::s_wqm, Definition(tmp),
7707                      bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp,
7708                               bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))));
7709             emit_wqm(ctx, tmp, dst);
7710          } else if (instr->dest.ssa.bit_size == 32) {
7711             if (ctx->program->chip_class >= GFX8)
7712                emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst);
7713             else
7714                emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), dst);
7715          } else if (instr->dest.ssa.bit_size == 64) {
7716             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7717             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7718             if (ctx->program->chip_class >= GFX8) {
7719                lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
7720                hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
7721             } else {
7722                lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl));
7723                hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl));
7724             }
7725             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7726             emit_split_vector(ctx, dst, 2);
7727          } else {
7728             fprintf(stderr, "Unimplemented NIR instr bit size: ");
7729             nir_print_instr(&instr->instr, stderr);
7730             fprintf(stderr, "\n");
7731          }
7732       }
7733       break;
7734    }
7735    case nir_intrinsic_quad_swap_horizontal:
7736    case nir_intrinsic_quad_swap_vertical:
7737    case nir_intrinsic_quad_swap_diagonal:
7738    case nir_intrinsic_quad_swizzle_amd: {
7739       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7740       if (!ctx->divergent_vals[instr->dest.ssa.index]) {
7741          emit_uniform_subgroup(ctx, instr, src);
7742          break;
7743       }
7744       uint16_t dpp_ctrl = 0;
7745       switch (instr->intrinsic) {
7746       case nir_intrinsic_quad_swap_horizontal:
7747          dpp_ctrl = dpp_quad_perm(1, 0, 3, 2);
7748          break;
7749       case nir_intrinsic_quad_swap_vertical:
7750          dpp_ctrl = dpp_quad_perm(2, 3, 0, 1);
7751          break;
7752       case nir_intrinsic_quad_swap_diagonal:
7753          dpp_ctrl = dpp_quad_perm(3, 2, 1, 0);
7754          break;
7755       case nir_intrinsic_quad_swizzle_amd:
7756          dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
7757          break;
7758       default:
7759          break;
7760       }
7761       if (ctx->program->chip_class < GFX8)
7762          dpp_ctrl |= (1 << 15);
7763
7764       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7765       if (instr->dest.ssa.bit_size == 1) {
7766          assert(src.regClass() == bld.lm);
7767          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
7768          if (ctx->program->chip_class >= GFX8)
7769             src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
7770          else
7771             src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
7772          Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
7773          emit_wqm(ctx, tmp, dst);
7774       } else if (instr->dest.ssa.bit_size == 32) {
7775          Temp tmp;
7776          if (ctx->program->chip_class >= GFX8)
7777             tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
7778          else
7779             tmp = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
7780          emit_wqm(ctx, tmp, dst);
7781       } else if (instr->dest.ssa.bit_size == 64) {
7782          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7783          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7784          if (ctx->program->chip_class >= GFX8) {
7785             lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
7786             hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
7787          } else {
7788             lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, dpp_ctrl));
7789             hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, dpp_ctrl));
7790          }
7791          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7792          emit_split_vector(ctx, dst, 2);
7793       } else {
7794          fprintf(stderr, "Unimplemented NIR instr bit size: ");
7795          nir_print_instr(&instr->instr, stderr);
7796          fprintf(stderr, "\n");
7797       }
7798       break;
7799    }
7800    case nir_intrinsic_masked_swizzle_amd: {
7801       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7802       if (!ctx->divergent_vals[instr->dest.ssa.index]) {
7803          emit_uniform_subgroup(ctx, instr, src);
7804          break;
7805       }
7806       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7807       uint32_t mask = nir_intrinsic_swizzle_mask(instr);
7808       if (dst.regClass() == v1) {
7809          emit_wqm(ctx,
7810                   bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
7811                   dst);
7812       } else if (dst.regClass() == v2) {
7813          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7814          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7815          lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
7816          hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
7817          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7818          emit_split_vector(ctx, dst, 2);
7819       } else {
7820          fprintf(stderr, "Unimplemented NIR instr bit size: ");
7821          nir_print_instr(&instr->instr, stderr);
7822          fprintf(stderr, "\n");
7823       }
7824       break;
7825    }
7826    case nir_intrinsic_write_invocation_amd: {
7827       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7828       Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
7829       Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
7830       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7831       if (dst.regClass() == v1) {
7832          /* src2 is ignored for writelane. RA assigns the same reg for dst */
7833          emit_wqm(ctx, bld.writelane(bld.def(v1), val, lane, src), dst);
7834       } else if (dst.regClass() == v2) {
7835          Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
7836          Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
7837          bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
7838          bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
7839          Temp lo = emit_wqm(ctx, bld.writelane(bld.def(v1), val_lo, lane, src_hi));
7840          Temp hi = emit_wqm(ctx, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
7841          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7842          emit_split_vector(ctx, dst, 2);
7843       } else {
7844          fprintf(stderr, "Unimplemented NIR instr bit size: ");
7845          nir_print_instr(&instr->instr, stderr);
7846          fprintf(stderr, "\n");
7847       }
7848       break;
7849    }
7850    case nir_intrinsic_mbcnt_amd: {
7851       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7852       RegClass rc = RegClass(src.type(), 1);
7853       Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
7854       bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
7855       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7856       Temp wqm_tmp = emit_mbcnt(ctx, bld.def(v1), Operand(mask_lo), Operand(mask_hi));
7857       emit_wqm(ctx, wqm_tmp, dst);
7858       break;
7859    }
7860    case nir_intrinsic_load_helper_invocation: {
7861       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7862       bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
7863       ctx->block->kind |= block_kind_needs_lowering;
7864       ctx->program->needs_exact = true;
7865       break;
7866    }
7867    case nir_intrinsic_is_helper_invocation: {
7868       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7869       bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
7870       ctx->block->kind |= block_kind_needs_lowering;
7871       ctx->program->needs_exact = true;
7872       break;
7873    }
7874    case nir_intrinsic_demote:
7875       bld.pseudo(aco_opcode::p_demote_to_helper, Operand(-1u));
7876
7877       if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
7878          ctx->cf_info.exec_potentially_empty_discard = true;
7879       ctx->block->kind |= block_kind_uses_demote;
7880       ctx->program->needs_exact = true;
7881       break;
7882    case nir_intrinsic_demote_if: {
7883       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7884       assert(src.regClass() == bld.lm);
7885       Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7886       bld.pseudo(aco_opcode::p_demote_to_helper, cond);
7887
7888       if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
7889          ctx->cf_info.exec_potentially_empty_discard = true;
7890       ctx->block->kind |= block_kind_uses_demote;
7891       ctx->program->needs_exact = true;
7892       break;
7893    }
7894    case nir_intrinsic_first_invocation: {
7895       emit_wqm(ctx, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
7896                get_ssa_temp(ctx, &instr->dest.ssa));
7897       break;
7898    }
7899    case nir_intrinsic_shader_clock:
7900       bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
7901       emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
7902       break;
7903    case nir_intrinsic_load_vertex_id_zero_base: {
7904       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7905       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
7906       break;
7907    }
7908    case nir_intrinsic_load_first_vertex: {
7909       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7910       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
7911       break;
7912    }
7913    case nir_intrinsic_load_base_instance: {
7914       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7915       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
7916       break;
7917    }
7918    case nir_intrinsic_load_instance_id: {
7919       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7920       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
7921       break;
7922    }
7923    case nir_intrinsic_load_draw_id: {
7924       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7925       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
7926       break;
7927    }
7928    case nir_intrinsic_load_invocation_id: {
7929       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7930
7931       if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
7932          if (ctx->options->chip_class >= GFX10)
7933             bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand(127u), get_arg(ctx, ctx->args->ac.gs_invocation_id));
7934          else
7935             bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
7936       } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
7937          bld.vop3(aco_opcode::v_bfe_u32, Definition(dst),
7938                   get_arg(ctx, ctx->args->ac.tcs_rel_ids), Operand(8u), Operand(5u));
7939       } else {
7940          unreachable("Unsupported stage for load_invocation_id");
7941       }
7942
7943       break;
7944    }
7945    case nir_intrinsic_load_primitive_id: {
7946       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7947
7948       switch (ctx->shader->info.stage) {
7949       case MESA_SHADER_GEOMETRY:
7950          bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
7951          break;
7952       case MESA_SHADER_TESS_CTRL:
7953          bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id));
7954          break;
7955       case MESA_SHADER_TESS_EVAL:
7956          bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id));
7957          break;
7958       default:
7959          unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
7960       }
7961
7962       break;
7963    }
7964    case nir_intrinsic_load_patch_vertices_in: {
7965       assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL ||
7966              ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
7967
7968       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7969       bld.copy(Definition(dst), Operand(ctx->args->options->key.tcs.input_vertices));
7970       break;
7971    }
7972    case nir_intrinsic_emit_vertex_with_counter: {
7973       visit_emit_vertex_with_counter(ctx, instr);
7974       break;
7975    }
7976    case nir_intrinsic_end_primitive_with_counter: {
7977       unsigned stream = nir_intrinsic_stream_id(instr);
7978       bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream));
7979       break;
7980    }
7981    case nir_intrinsic_set_vertex_count: {
7982       /* unused, the HW keeps track of this for us */
7983       break;
7984    }
7985    default:
7986       fprintf(stderr, "Unimplemented intrinsic instr: ");
7987       nir_print_instr(&instr->instr, stderr);
7988       fprintf(stderr, "\n");
7989       abort();
7990
7991       break;
7992    }
7993 }
7994
7995
7996 void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr,
7997                     Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr,
7998                     enum glsl_base_type *stype)
7999 {
8000    nir_deref_instr *texture_deref_instr = NULL;
8001    nir_deref_instr *sampler_deref_instr = NULL;
8002    int plane = -1;
8003
8004    for (unsigned i = 0; i < instr->num_srcs; i++) {
8005       switch (instr->src[i].src_type) {
8006       case nir_tex_src_texture_deref:
8007          texture_deref_instr = nir_src_as_deref(instr->src[i].src);
8008          break;
8009       case nir_tex_src_sampler_deref:
8010          sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
8011          break;
8012       case nir_tex_src_plane:
8013          plane = nir_src_as_int(instr->src[i].src);
8014          break;
8015       default:
8016          break;
8017       }
8018    }
8019
8020    *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
8021
8022    if (!sampler_deref_instr)
8023       sampler_deref_instr = texture_deref_instr;
8024
8025    if (plane >= 0) {
8026       assert(instr->op != nir_texop_txf_ms &&
8027              instr->op != nir_texop_samples_identical);
8028       assert(instr->sampler_dim  != GLSL_SAMPLER_DIM_BUF);
8029       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
8030    } else if (instr->sampler_dim  == GLSL_SAMPLER_DIM_BUF) {
8031       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
8032    } else if (instr->op == nir_texop_fragment_mask_fetch) {
8033       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
8034    } else {
8035       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
8036    }
8037    if (samp_ptr) {
8038       *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false);
8039
8040       if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
8041          /* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */
8042          Builder bld(ctx->program, ctx->block);
8043
8044          /* to avoid unnecessary moves, we split and recombine sampler and image */
8045          Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1),
8046                         bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
8047          Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
8048          bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]),
8049                     Definition(img[2]), Definition(img[3]), Definition(img[4]),
8050                     Definition(img[5]), Definition(img[6]), Definition(img[7]), *res_ptr);
8051          bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]),
8052                     Definition(samp[2]), Definition(samp[3]), *samp_ptr);
8053
8054          samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]);
8055          *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
8056                                img[0], img[1], img[2], img[3],
8057                                img[4], img[5], img[6], img[7]);
8058          *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
8059                                 samp[0], samp[1], samp[2], samp[3]);
8060       }
8061    }
8062    if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
8063                      instr->op == nir_texop_samples_identical))
8064       *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
8065 }
8066
8067 void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
8068                        Temp *out_ma, Temp *out_sc, Temp *out_tc)
8069 {
8070    Builder bld(ctx->program, ctx->block);
8071
8072    Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
8073    Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
8074    Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
8075
8076    Operand neg_one(0xbf800000u);
8077    Operand one(0x3f800000u);
8078    Operand two(0x40000000u);
8079    Operand four(0x40800000u);
8080
8081    Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma);
8082    Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
8083    Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
8084
8085    Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
8086    Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
8087    is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
8088    Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y);
8089
8090    // select sc
8091    Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
8092    Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1),
8093                        bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z),
8094                        one, is_ma_y);
8095    *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
8096
8097    // select tc
8098    tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
8099    sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
8100    *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
8101
8102    // select ma
8103    tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
8104                   bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
8105                   deriv_z, is_ma_z);
8106    tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp);
8107    *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
8108 }
8109
8110 void prepare_cube_coords(isel_context *ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
8111 {
8112    Builder bld(ctx->program, ctx->block);
8113    Temp ma, tc, sc, id;
8114
8115    if (is_array) {
8116       coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);
8117
8118       // see comment in ac_prepare_cube_coords()
8119       if (ctx->options->chip_class <= GFX8)
8120          coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coords[3]);
8121    }
8122
8123    ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8124
8125    aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
8126    vop3a->operands[0] = Operand(ma);
8127    vop3a->abs[0] = true;
8128    Temp invma = bld.tmp(v1);
8129    vop3a->definitions[0] = Definition(invma);
8130    ctx->block->instructions.emplace_back(std::move(vop3a));
8131
8132    sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8133    if (!is_deriv)
8134       sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
8135
8136    tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8137    if (!is_deriv)
8138       tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
8139
8140    id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8141
8142    if (is_deriv) {
8143       sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
8144       tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
8145
8146       for (unsigned i = 0; i < 2; i++) {
8147          // see comment in ac_prepare_cube_coords()
8148          Temp deriv_ma;
8149          Temp deriv_sc, deriv_tc;
8150          build_cube_select(ctx, ma, id, i ? *ddy : *ddx,
8151                            &deriv_ma, &deriv_sc, &deriv_tc);
8152
8153          deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
8154
8155          Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
8156                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
8157                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
8158          Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
8159                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
8160                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
8161          *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
8162       }
8163
8164       sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc);
8165       tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc);
8166    }
8167
8168    if (is_array)
8169       id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/));
8170    coords.resize(3);
8171    coords[0] = sc;
8172    coords[1] = tc;
8173    coords[2] = id;
8174 }
8175
8176 void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
8177 {
8178    if (vec->parent_instr->type != nir_instr_type_alu)
8179       return;
8180    nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr);
8181    if (vec_instr->op != nir_op_vec(vec->num_components))
8182       return;
8183
8184    for (unsigned i = 0; i < vec->num_components; i++) {
8185       cv[i] = vec_instr->src[i].swizzle[0] == 0 ?
8186               nir_src_as_const_value(vec_instr->src[i].src) : NULL;
8187    }
8188 }
8189
8190 void visit_tex(isel_context *ctx, nir_tex_instr *instr)
8191 {
8192    Builder bld(ctx->program, ctx->block);
8193    bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
8194         has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
8195    Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(),
8196         lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp();
8197    std::vector<Temp> coords;
8198    std::vector<Temp> derivs;
8199    nir_const_value *sample_index_cv = NULL;
8200    nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
8201    enum glsl_base_type stype;
8202    tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
8203
8204    bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
8205                                   (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
8206    bool tg4_integer_cube_workaround = tg4_integer_workarounds &&
8207                                       instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
8208
8209    for (unsigned i = 0; i < instr->num_srcs; i++) {
8210       switch (instr->src[i].src_type) {
8211       case nir_tex_src_coord: {
8212          Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
8213          for (unsigned i = 0; i < coord.size(); i++)
8214             coords.emplace_back(emit_extract_vector(ctx, coord, i, v1));
8215          break;
8216       }
8217       case nir_tex_src_bias:
8218          if (instr->op == nir_texop_txb) {
8219             bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
8220             has_bias = true;
8221          }
8222          break;
8223       case nir_tex_src_lod: {
8224          nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
8225
8226          if (val && val->f32 <= 0.0) {
8227             level_zero = true;
8228          } else {
8229             lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
8230             has_lod = true;
8231          }
8232          break;
8233       }
8234       case nir_tex_src_comparator:
8235          if (instr->is_shadow) {
8236             compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
8237             has_compare = true;
8238          }
8239          break;
8240       case nir_tex_src_offset:
8241          offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
8242          get_const_vec(instr->src[i].src.ssa, const_offset);
8243          has_offset = true;
8244          break;
8245       case nir_tex_src_ddx:
8246          ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
8247          has_ddx = true;
8248          break;
8249       case nir_tex_src_ddy:
8250          ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
8251          has_ddy = true;
8252          break;
8253       case nir_tex_src_ms_index:
8254          sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
8255          sample_index_cv = nir_src_as_const_value(instr->src[i].src);
8256          has_sample_index = true;
8257          break;
8258       case nir_tex_src_texture_offset:
8259       case nir_tex_src_sampler_offset:
8260       default:
8261          break;
8262       }
8263    }
8264
8265    if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
8266       return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
8267
8268    if (instr->op == nir_texop_texture_samples) {
8269       Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
8270
8271       Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
8272       Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
8273       Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
8274       Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
8275
8276       bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8277                samples, Operand(1u), bld.scc(is_msaa));
8278       return;
8279    }
8280
8281    if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
8282       aco_ptr<Instruction> tmp_instr;
8283       Temp acc, pack = Temp();
8284
8285       uint32_t pack_const = 0;
8286       for (unsigned i = 0; i < offset.size(); i++) {
8287          if (!const_offset[i])
8288             continue;
8289          pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
8290       }
8291
8292       if (offset.type() == RegType::sgpr) {
8293          for (unsigned i = 0; i < offset.size(); i++) {
8294             if (const_offset[i])
8295                continue;
8296
8297             acc = emit_extract_vector(ctx, offset, i, s1);
8298             acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
8299
8300             if (i) {
8301                acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
8302             }
8303
8304             if (pack == Temp()) {
8305                pack = acc;
8306             } else {
8307                pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
8308             }
8309          }
8310
8311          if (pack_const && pack != Temp())
8312             pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
8313       } else {
8314          for (unsigned i = 0; i < offset.size(); i++) {
8315             if (const_offset[i])
8316                continue;
8317
8318             acc = emit_extract_vector(ctx, offset, i, v1);
8319             acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
8320
8321             if (i) {
8322                acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
8323             }
8324
8325             if (pack == Temp()) {
8326                pack = acc;
8327             } else {
8328                pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
8329             }
8330          }
8331
8332          if (pack_const && pack != Temp())
8333             pack = bld.sop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
8334       }
8335       if (pack_const && pack == Temp())
8336          offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
8337       else if (pack == Temp())
8338          has_offset = false;
8339       else
8340          offset = pack;
8341    }
8342
8343    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
8344       prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
8345
8346    /* pack derivatives */
8347    if (has_ddx || has_ddy) {
8348       if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
8349          assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1);
8350          Temp zero = bld.copy(bld.def(v1), Operand(0u));
8351          derivs = {ddy, zero, ddy, zero};
8352       } else {
8353          for (unsigned i = 0; has_ddx && i < ddx.size(); i++)
8354             derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1));
8355          for (unsigned i = 0; has_ddy && i < ddy.size(); i++)
8356             derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1));
8357       }
8358       has_derivs = true;
8359    }
8360
8361    if (instr->coord_components > 1 &&
8362        instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
8363        instr->is_array &&
8364        instr->op != nir_texop_txf)
8365       coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]);
8366
8367    if (instr->coord_components > 2 &&
8368       (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
8369        instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
8370        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
8371        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
8372        instr->is_array &&
8373        instr->op != nir_texop_txf &&
8374        instr->op != nir_texop_txf_ms &&
8375        instr->op != nir_texop_fragment_fetch &&
8376        instr->op != nir_texop_fragment_mask_fetch)
8377       coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]);
8378
8379    if (ctx->options->chip_class == GFX9 &&
8380        instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
8381        instr->op != nir_texop_lod && instr->coord_components) {
8382       assert(coords.size() > 0 && coords.size() < 3);
8383
8384       coords.insert(std::next(coords.begin()), bld.copy(bld.def(v1), instr->op == nir_texop_txf ?
8385                                                                      Operand((uint32_t) 0) :
8386                                                                      Operand((uint32_t) 0x3f000000)));
8387    }
8388
8389    bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
8390
8391    if (instr->op == nir_texop_samples_identical)
8392       resource = fmask_ptr;
8393
8394    else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
8395              instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
8396             instr->op != nir_texop_txs &&
8397             instr->op != nir_texop_fragment_fetch &&
8398             instr->op != nir_texop_fragment_mask_fetch) {
8399       assert(has_sample_index);
8400       Operand op(sample_index);
8401       if (sample_index_cv)
8402          op = Operand(sample_index_cv->u32);
8403       sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
8404    }
8405
8406    if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
8407       for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) {
8408          Temp off = emit_extract_vector(ctx, offset, i, v1);
8409          coords[i] = bld.vadd32(bld.def(v1), coords[i], off);
8410       }
8411       has_offset = false;
8412    }
8413
8414    /* Build tex instruction */
8415    unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
8416    unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
8417                   ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
8418                   : 0;
8419    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8420    Temp tmp_dst = dst;
8421
8422    /* gather4 selects the component by dmask and always returns vec4 */
8423    if (instr->op == nir_texop_tg4) {
8424       assert(instr->dest.ssa.num_components == 4);
8425       if (instr->is_shadow)
8426          dmask = 1;
8427       else
8428          dmask = 1 << instr->component;
8429       if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
8430          tmp_dst = bld.tmp(v4);
8431    } else if (instr->op == nir_texop_samples_identical) {
8432       tmp_dst = bld.tmp(v1);
8433    } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) {
8434       tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
8435    }
8436
8437    aco_ptr<MIMG_instruction> tex;
8438    if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
8439       if (!has_lod)
8440          lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
8441
8442       bool div_by_6 = instr->op == nir_texop_txs &&
8443                       instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
8444                       instr->is_array &&
8445                       (dmask & (1 << 2));
8446       if (tmp_dst.id() == dst.id() && div_by_6)
8447          tmp_dst = bld.tmp(tmp_dst.regClass());
8448
8449       tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
8450       tex->operands[0] = Operand(resource);
8451       tex->operands[1] = Operand(s4); /* no sampler */
8452       tex->operands[2] = Operand(as_vgpr(ctx,lod));
8453       if (ctx->options->chip_class == GFX9 &&
8454           instr->op == nir_texop_txs &&
8455           instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
8456           instr->is_array) {
8457          tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
8458       } else if (instr->op == nir_texop_query_levels) {
8459          tex->dmask = 1 << 3;
8460       } else {
8461          tex->dmask = dmask;
8462       }
8463       tex->da = da;
8464       tex->definitions[0] = Definition(tmp_dst);
8465       tex->dim = dim;
8466       tex->can_reorder = true;
8467       ctx->block->instructions.emplace_back(std::move(tex));
8468
8469       if (div_by_6) {
8470          /* divide 3rd value by 6 by multiplying with magic number */
8471          emit_split_vector(ctx, tmp_dst, tmp_dst.size());
8472          Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
8473          Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c);
8474          assert(instr->dest.ssa.num_components == 3);
8475          Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
8476          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
8477                               emit_extract_vector(ctx, tmp_dst, 0, v1),
8478                               emit_extract_vector(ctx, tmp_dst, 1, v1),
8479                               by_6);
8480
8481       }
8482
8483       expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
8484       return;
8485    }
8486
8487    Temp tg4_compare_cube_wa64 = Temp();
8488
8489    if (tg4_integer_workarounds) {
8490       tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
8491       tex->operands[0] = Operand(resource);
8492       tex->operands[1] = Operand(s4); /* no sampler */
8493       tex->operands[2] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
8494       tex->dim = dim;
8495       tex->dmask = 0x3;
8496       tex->da = da;
8497       Temp size = bld.tmp(v2);
8498       tex->definitions[0] = Definition(size);
8499       tex->can_reorder = true;
8500       ctx->block->instructions.emplace_back(std::move(tex));
8501       emit_split_vector(ctx, size, size.size());
8502
8503       Temp half_texel[2];
8504       for (unsigned i = 0; i < 2; i++) {
8505          half_texel[i] = emit_extract_vector(ctx, size, i, v1);
8506          half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
8507          half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
8508          half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
8509       }
8510
8511       Temp new_coords[2] = {
8512          bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
8513          bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])
8514       };
8515
8516       if (tg4_integer_cube_workaround) {
8517          // see comment in ac_nir_to_llvm.c's lower_gather4_integer()
8518          Temp desc[resource.size()];
8519          aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector,
8520                                                                            Format::PSEUDO, 1, resource.size())};
8521          split->operands[0] = Operand(resource);
8522          for (unsigned i = 0; i < resource.size(); i++) {
8523             desc[i] = bld.tmp(s1);
8524             split->definitions[i] = Definition(desc[i]);
8525          }
8526          ctx->block->instructions.emplace_back(std::move(split));
8527
8528          Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16)));
8529          Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
8530                                          Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8));
8531
8532          Temp nfmt;
8533          if (stype == GLSL_TYPE_UINT) {
8534             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
8535                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED),
8536                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT),
8537                             bld.scc(compare_cube_wa));
8538          } else {
8539             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
8540                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED),
8541                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
8542                             bld.scc(compare_cube_wa));
8543          }
8544          tg4_compare_cube_wa64 = bld.tmp(bld.lm);
8545          bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
8546
8547          nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
8548
8549          desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
8550                             Operand((uint32_t)C_008F14_NUM_FORMAT));
8551          desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
8552
8553          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
8554                                                                          Format::PSEUDO, resource.size(), 1)};
8555          for (unsigned i = 0; i < resource.size(); i++)
8556             vec->operands[i] = Operand(desc[i]);
8557          resource = bld.tmp(resource.regClass());
8558          vec->definitions[0] = Definition(resource);
8559          ctx->block->instructions.emplace_back(std::move(vec));
8560
8561          new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
8562                                   new_coords[0], coords[0], tg4_compare_cube_wa64);
8563          new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
8564                                   new_coords[1], coords[1], tg4_compare_cube_wa64);
8565       }
8566       coords[0] = new_coords[0];
8567       coords[1] = new_coords[1];
8568    }
8569
8570    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
8571       //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
8572
8573       assert(coords.size() == 1);
8574       unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
8575       aco_opcode op;
8576       switch (last_bit) {
8577       case 1:
8578          op = aco_opcode::buffer_load_format_x; break;
8579       case 2:
8580          op = aco_opcode::buffer_load_format_xy; break;
8581       case 3:
8582          op = aco_opcode::buffer_load_format_xyz; break;
8583       case 4:
8584          op = aco_opcode::buffer_load_format_xyzw; break;
8585       default:
8586          unreachable("Tex instruction loads more than 4 components.");
8587       }
8588
8589       /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */
8590       if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
8591          tmp_dst = dst;
8592       else
8593          tmp_dst = bld.tmp(RegType::vgpr, last_bit);
8594
8595       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
8596       mubuf->operands[0] = Operand(resource);
8597       mubuf->operands[1] = Operand(coords[0]);
8598       mubuf->operands[2] = Operand((uint32_t) 0);
8599       mubuf->definitions[0] = Definition(tmp_dst);
8600       mubuf->idxen = true;
8601       mubuf->can_reorder = true;
8602       ctx->block->instructions.emplace_back(std::move(mubuf));
8603
8604       expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
8605       return;
8606    }
8607
8608    /* gather MIMG address components */
8609    std::vector<Temp> args;
8610    if (has_offset)
8611       args.emplace_back(offset);
8612    if (has_bias)
8613       args.emplace_back(bias);
8614    if (has_compare)
8615       args.emplace_back(compare);
8616    if (has_derivs)
8617       args.insert(args.end(), derivs.begin(), derivs.end());
8618
8619    args.insert(args.end(), coords.begin(), coords.end());
8620    if (has_sample_index)
8621       args.emplace_back(sample_index);
8622    if (has_lod)
8623       args.emplace_back(lod);
8624
8625    Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size()));
8626    aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
8627    vec->definitions[0] = Definition(arg);
8628    for (unsigned i = 0; i < args.size(); i++)
8629       vec->operands[i] = Operand(args[i]);
8630    ctx->block->instructions.emplace_back(std::move(vec));
8631
8632
8633    if (instr->op == nir_texop_txf ||
8634        instr->op == nir_texop_txf_ms ||
8635        instr->op == nir_texop_samples_identical ||
8636        instr->op == nir_texop_fragment_fetch ||
8637        instr->op == nir_texop_fragment_mask_fetch) {
8638       aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
8639       tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 3, 1));
8640       tex->operands[0] = Operand(resource);
8641       tex->operands[1] = Operand(s4); /* no sampler */
8642       tex->operands[2] = Operand(arg);
8643       tex->dim = dim;
8644       tex->dmask = dmask;
8645       tex->unrm = true;
8646       tex->da = da;
8647       tex->definitions[0] = Definition(tmp_dst);
8648       tex->can_reorder = true;
8649       ctx->block->instructions.emplace_back(std::move(tex));
8650
8651       if (instr->op == nir_texop_samples_identical) {
8652          assert(dmask == 1 && dst.regClass() == v1);
8653          assert(dst.id() != tmp_dst.id());
8654
8655          Temp tmp = bld.tmp(bld.lm);
8656          bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
8657          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
8658
8659       } else {
8660          expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
8661       }
8662       return;
8663    }
8664
8665    // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
8666    aco_opcode opcode = aco_opcode::image_sample;
8667    if (has_offset) { /* image_sample_*_o */
8668       if (has_compare) {
8669          opcode = aco_opcode::image_sample_c_o;
8670          if (has_derivs)
8671             opcode = aco_opcode::image_sample_c_d_o;
8672          if (has_bias)
8673             opcode = aco_opcode::image_sample_c_b_o;
8674          if (level_zero)
8675             opcode = aco_opcode::image_sample_c_lz_o;
8676          if (has_lod)
8677             opcode = aco_opcode::image_sample_c_l_o;
8678       } else {
8679          opcode = aco_opcode::image_sample_o;
8680          if (has_derivs)
8681             opcode = aco_opcode::image_sample_d_o;
8682          if (has_bias)
8683             opcode = aco_opcode::image_sample_b_o;
8684          if (level_zero)
8685             opcode = aco_opcode::image_sample_lz_o;
8686          if (has_lod)
8687             opcode = aco_opcode::image_sample_l_o;
8688       }
8689    } else { /* no offset */
8690       if (has_compare) {
8691          opcode = aco_opcode::image_sample_c;
8692          if (has_derivs)
8693             opcode = aco_opcode::image_sample_c_d;
8694          if (has_bias)
8695             opcode = aco_opcode::image_sample_c_b;
8696          if (level_zero)
8697             opcode = aco_opcode::image_sample_c_lz;
8698          if (has_lod)
8699             opcode = aco_opcode::image_sample_c_l;
8700       } else {
8701          opcode = aco_opcode::image_sample;
8702          if (has_derivs)
8703             opcode = aco_opcode::image_sample_d;
8704          if (has_bias)
8705             opcode = aco_opcode::image_sample_b;
8706          if (level_zero)
8707             opcode = aco_opcode::image_sample_lz;
8708          if (has_lod)
8709             opcode = aco_opcode::image_sample_l;
8710       }
8711    }
8712
8713    if (instr->op == nir_texop_tg4) {
8714       if (has_offset) {
8715          opcode = aco_opcode::image_gather4_lz_o;
8716          if (has_compare)
8717             opcode = aco_opcode::image_gather4_c_lz_o;
8718       } else {
8719          opcode = aco_opcode::image_gather4_lz;
8720          if (has_compare)
8721             opcode = aco_opcode::image_gather4_c_lz;
8722       }
8723    } else if (instr->op == nir_texop_lod) {
8724       opcode = aco_opcode::image_get_lod;
8725    }
8726
8727    /* we don't need the bias, sample index, compare value or offset to be
8728     * computed in WQM but if the p_create_vector copies the coordinates, then it
8729     * needs to be in WQM */
8730    if (ctx->stage == fragment_fs &&
8731        !has_derivs && !has_lod && !level_zero &&
8732        instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
8733        instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
8734       arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true);
8735
8736    tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
8737    tex->operands[0] = Operand(resource);
8738    tex->operands[1] = Operand(sampler);
8739    tex->operands[2] = Operand(arg);
8740    tex->dim = dim;
8741    tex->dmask = dmask;
8742    tex->da = da;
8743    tex->definitions[0] = Definition(tmp_dst);
8744    tex->can_reorder = true;
8745    ctx->block->instructions.emplace_back(std::move(tex));
8746
8747    if (tg4_integer_cube_workaround) {
8748       assert(tmp_dst.id() != dst.id());
8749       assert(tmp_dst.size() == dst.size() && dst.size() == 4);
8750
8751       emit_split_vector(ctx, tmp_dst, tmp_dst.size());
8752       Temp val[4];
8753       for (unsigned i = 0; i < dst.size(); i++) {
8754          val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
8755          Temp cvt_val;
8756          if (stype == GLSL_TYPE_UINT)
8757             cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
8758          else
8759             cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
8760          val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64);
8761       }
8762       Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4);
8763       tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
8764                            val[0], val[1], val[2], val[3]);
8765    }
8766    unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask;
8767    expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
8768
8769 }
8770
8771
8772 Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa)
8773 {
8774    Temp tmp = get_ssa_temp(ctx, ssa);
8775    if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
8776       return Operand(tmp.regClass());
8777    else
8778       return Operand(tmp);
8779 }
8780
8781 void visit_phi(isel_context *ctx, nir_phi_instr *instr)
8782 {
8783    aco_ptr<Pseudo_instruction> phi;
8784    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8785    assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
8786
8787    bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index];
8788    logical |= ctx->block->kind & block_kind_merge;
8789    aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
8790
8791    /* we want a sorted list of sources, since the predecessor list is also sorted */
8792    std::map<unsigned, nir_ssa_def*> phi_src;
8793    nir_foreach_phi_src(src, instr)
8794       phi_src[src->pred->index] = src->src.ssa;
8795
8796    std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
8797    unsigned num_operands = 0;
8798    Operand operands[std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1];
8799    unsigned num_defined = 0;
8800    unsigned cur_pred_idx = 0;
8801    for (std::pair<unsigned, nir_ssa_def *> src : phi_src) {
8802       if (cur_pred_idx < preds.size()) {
8803          /* handle missing preds (IF merges with discard/break) and extra preds (loop exit with discard) */
8804          unsigned block = ctx->cf_info.nir_to_aco[src.first];
8805          unsigned skipped = 0;
8806          while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
8807             skipped++;
8808          if (cur_pred_idx + skipped < preds.size()) {
8809             for (unsigned i = 0; i < skipped; i++)
8810                operands[num_operands++] = Operand(dst.regClass());
8811             cur_pred_idx += skipped;
8812          } else {
8813             continue;
8814          }
8815       }
8816       /* Handle missing predecessors at the end. This shouldn't happen with loop
8817        * headers and we can't ignore these sources for loop header phis. */
8818       if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
8819          continue;
8820       cur_pred_idx++;
8821       Operand op = get_phi_operand(ctx, src.second);
8822       operands[num_operands++] = op;
8823       num_defined += !op.isUndefined();
8824    }
8825    /* handle block_kind_continue_or_break at loop exit blocks */
8826    while (cur_pred_idx++ < preds.size())
8827       operands[num_operands++] = Operand(dst.regClass());
8828
8829    /* If the loop ends with a break, still add a linear continue edge in case
8830     * that break is divergent or continue_or_break is used. We'll either remove
8831     * this operand later in visit_loop() if it's not necessary or replace the
8832     * undef with something correct. */
8833    if (!logical && ctx->block->kind & block_kind_loop_header) {
8834       nir_loop *loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
8835       nir_block *last = nir_loop_last_block(loop);
8836       if (last->successors[0] != instr->instr.block)
8837          operands[num_operands++] = Operand(RegClass());
8838    }
8839
8840    if (num_defined == 0) {
8841       Builder bld(ctx->program, ctx->block);
8842       if (dst.regClass() == s1) {
8843          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
8844       } else if (dst.regClass() == v1) {
8845          bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
8846       } else {
8847          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
8848          for (unsigned i = 0; i < dst.size(); i++)
8849             vec->operands[i] = Operand(0u);
8850          vec->definitions[0] = Definition(dst);
8851          ctx->block->instructions.emplace_back(std::move(vec));
8852       }
8853       return;
8854    }
8855
8856    /* we can use a linear phi in some cases if one src is undef */
8857    if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
8858       phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, num_operands, 1));
8859
8860       Block *linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
8861       Block *invert = &ctx->program->blocks[linear_else->linear_preds[0]];
8862       assert(invert->kind & block_kind_invert);
8863
8864       unsigned then_block = invert->linear_preds[0];
8865
8866       Block* insert_block = NULL;
8867       for (unsigned i = 0; i < num_operands; i++) {
8868          Operand op = operands[i];
8869          if (op.isUndefined())
8870             continue;
8871          insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
8872          phi->operands[0] = op;
8873          break;
8874       }
8875       assert(insert_block); /* should be handled by the "num_defined == 0" case above */
8876       phi->operands[1] = Operand(dst.regClass());
8877       phi->definitions[0] = Definition(dst);
8878       insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
8879       return;
8880    }
8881
8882    /* try to scalarize vector phis */
8883    if (instr->dest.ssa.bit_size != 1 && dst.size() > 1) {
8884       // TODO: scalarize linear phis on divergent ifs
8885       bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
8886       std::array<Temp, NIR_MAX_VEC_COMPONENTS> new_vec;
8887       for (unsigned i = 0; can_scalarize && (i < num_operands); i++) {
8888          Operand src = operands[i];
8889          if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end())
8890             can_scalarize = false;
8891       }
8892       if (can_scalarize) {
8893          unsigned num_components = instr->dest.ssa.num_components;
8894          assert(dst.size() % num_components == 0);
8895          RegClass rc = RegClass(dst.type(), dst.size() / num_components);
8896
8897          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
8898          for (unsigned k = 0; k < num_components; k++) {
8899             phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
8900             for (unsigned i = 0; i < num_operands; i++) {
8901                Operand src = operands[i];
8902                phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc);
8903             }
8904             Temp phi_dst = {ctx->program->allocateId(), rc};
8905             phi->definitions[0] = Definition(phi_dst);
8906             ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
8907             new_vec[k] = phi_dst;
8908             vec->operands[k] = Operand(phi_dst);
8909          }
8910          vec->definitions[0] = Definition(dst);
8911          ctx->block->instructions.emplace_back(std::move(vec));
8912          ctx->allocated_vec.emplace(dst.id(), new_vec);
8913          return;
8914       }
8915    }
8916
8917    phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
8918    for (unsigned i = 0; i < num_operands; i++)
8919       phi->operands[i] = operands[i];
8920    phi->definitions[0] = Definition(dst);
8921    ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
8922 }
8923
8924
8925 void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr)
8926 {
8927    Temp dst = get_ssa_temp(ctx, &instr->def);
8928
8929    assert(dst.type() == RegType::sgpr);
8930
8931    if (dst.size() == 1) {
8932       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u));
8933    } else {
8934       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
8935       for (unsigned i = 0; i < dst.size(); i++)
8936          vec->operands[i] = Operand(0u);
8937       vec->definitions[0] = Definition(dst);
8938       ctx->block->instructions.emplace_back(std::move(vec));
8939    }
8940 }
8941
8942 void visit_jump(isel_context *ctx, nir_jump_instr *instr)
8943 {
8944    Builder bld(ctx->program, ctx->block);
8945    Block *logical_target;
8946    append_logical_end(ctx->block);
8947    unsigned idx = ctx->block->index;
8948
8949    switch (instr->type) {
8950    case nir_jump_break:
8951       logical_target = ctx->cf_info.parent_loop.exit;
8952       add_logical_edge(idx, logical_target);
8953       ctx->block->kind |= block_kind_break;
8954
8955       if (!ctx->cf_info.parent_if.is_divergent &&
8956           !ctx->cf_info.parent_loop.has_divergent_continue) {
8957          /* uniform break - directly jump out of the loop */
8958          ctx->block->kind |= block_kind_uniform;
8959          ctx->cf_info.has_branch = true;
8960          bld.branch(aco_opcode::p_branch);
8961          add_linear_edge(idx, logical_target);
8962          return;
8963       }
8964       ctx->cf_info.parent_loop.has_divergent_branch = true;
8965       ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
8966       break;
8967    case nir_jump_continue:
8968       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
8969       add_logical_edge(idx, logical_target);
8970       ctx->block->kind |= block_kind_continue;
8971
8972       if (ctx->cf_info.parent_if.is_divergent) {
8973          /* for potential uniform breaks after this continue,
8974             we must ensure that they are handled correctly */
8975          ctx->cf_info.parent_loop.has_divergent_continue = true;
8976          ctx->cf_info.parent_loop.has_divergent_branch = true;
8977          ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
8978       } else {
8979          /* uniform continue - directly jump to the loop header */
8980          ctx->block->kind |= block_kind_uniform;
8981          ctx->cf_info.has_branch = true;
8982          bld.branch(aco_opcode::p_branch);
8983          add_linear_edge(idx, logical_target);
8984          return;
8985       }
8986       break;
8987    default:
8988       fprintf(stderr, "Unknown NIR jump instr: ");
8989       nir_print_instr(&instr->instr, stderr);
8990       fprintf(stderr, "\n");
8991       abort();
8992    }
8993
8994    if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
8995       ctx->cf_info.exec_potentially_empty_break = true;
8996       ctx->cf_info.exec_potentially_empty_break_depth = ctx->cf_info.loop_nest_depth;
8997    }
8998
8999    /* remove critical edges from linear CFG */
9000    bld.branch(aco_opcode::p_branch);
9001    Block* break_block = ctx->program->create_and_insert_block();
9002    break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9003    break_block->kind |= block_kind_uniform;
9004    add_linear_edge(idx, break_block);
9005    /* the loop_header pointer might be invalidated by this point */
9006    if (instr->type == nir_jump_continue)
9007       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
9008    add_linear_edge(break_block->index, logical_target);
9009    bld.reset(break_block);
9010    bld.branch(aco_opcode::p_branch);
9011
9012    Block* continue_block = ctx->program->create_and_insert_block();
9013    continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9014    add_linear_edge(idx, continue_block);
9015    append_logical_start(continue_block);
9016    ctx->block = continue_block;
9017    return;
9018 }
9019
9020 void visit_block(isel_context *ctx, nir_block *block)
9021 {
9022    nir_foreach_instr(instr, block) {
9023       switch (instr->type) {
9024       case nir_instr_type_alu:
9025          visit_alu_instr(ctx, nir_instr_as_alu(instr));
9026          break;
9027       case nir_instr_type_load_const:
9028          visit_load_const(ctx, nir_instr_as_load_const(instr));
9029          break;
9030       case nir_instr_type_intrinsic:
9031          visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
9032          break;
9033       case nir_instr_type_tex:
9034          visit_tex(ctx, nir_instr_as_tex(instr));
9035          break;
9036       case nir_instr_type_phi:
9037          visit_phi(ctx, nir_instr_as_phi(instr));
9038          break;
9039       case nir_instr_type_ssa_undef:
9040          visit_undef(ctx, nir_instr_as_ssa_undef(instr));
9041          break;
9042       case nir_instr_type_deref:
9043          break;
9044       case nir_instr_type_jump:
9045          visit_jump(ctx, nir_instr_as_jump(instr));
9046          break;
9047       default:
9048          fprintf(stderr, "Unknown NIR instr type: ");
9049          nir_print_instr(instr, stderr);
9050          fprintf(stderr, "\n");
9051          //abort();
9052       }
9053    }
9054
9055    if (!ctx->cf_info.parent_loop.has_divergent_branch)
9056       ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
9057 }
9058
9059
9060
9061 static Operand create_continue_phis(isel_context *ctx, unsigned first, unsigned last,
9062                                     aco_ptr<Instruction>& header_phi, Operand *vals)
9063 {
9064    vals[0] = Operand(header_phi->definitions[0].getTemp());
9065    RegClass rc = vals[0].regClass();
9066
9067    unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
9068
9069    unsigned next_pred = 1;
9070
9071    for (unsigned idx = first + 1; idx <= last; idx++) {
9072       Block& block = ctx->program->blocks[idx];
9073       if (block.loop_nest_depth != loop_nest_depth) {
9074          vals[idx - first] = vals[idx - 1 - first];
9075          continue;
9076       }
9077
9078       if (block.kind & block_kind_continue) {
9079          vals[idx - first] = header_phi->operands[next_pred];
9080          next_pred++;
9081          continue;
9082       }
9083
9084       bool all_same = true;
9085       for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
9086          all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
9087
9088       Operand val;
9089       if (all_same) {
9090          val = vals[block.linear_preds[0] - first];
9091       } else {
9092          aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
9093             aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
9094          for (unsigned i = 0; i < block.linear_preds.size(); i++)
9095             phi->operands[i] = vals[block.linear_preds[i] - first];
9096          val = Operand(Temp(ctx->program->allocateId(), rc));
9097          phi->definitions[0] = Definition(val.getTemp());
9098          block.instructions.emplace(block.instructions.begin(), std::move(phi));
9099       }
9100       vals[idx - first] = val;
9101    }
9102
9103    return vals[last - first];
9104 }
9105
9106 static void visit_loop(isel_context *ctx, nir_loop *loop)
9107 {
9108    //TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
9109    append_logical_end(ctx->block);
9110    ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
9111    Builder bld(ctx->program, ctx->block);
9112    bld.branch(aco_opcode::p_branch);
9113    unsigned loop_preheader_idx = ctx->block->index;
9114
9115    Block loop_exit = Block();
9116    loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9117    loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
9118
9119    Block* loop_header = ctx->program->create_and_insert_block();
9120    loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
9121    loop_header->kind |= block_kind_loop_header;
9122    add_edge(loop_preheader_idx, loop_header);
9123    ctx->block = loop_header;
9124
9125    /* emit loop body */
9126    unsigned loop_header_idx = loop_header->index;
9127    loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
9128    append_logical_start(ctx->block);
9129    bool unreachable = visit_cf_list(ctx, &loop->body);
9130
9131    //TODO: what if a loop ends with a unconditional or uniformly branched continue and this branch is never taken?
9132    if (!ctx->cf_info.has_branch) {
9133       append_logical_end(ctx->block);
9134       if (ctx->cf_info.exec_potentially_empty_discard || ctx->cf_info.exec_potentially_empty_break) {
9135          /* Discards can result in code running with an empty exec mask.
9136           * This would result in divergent breaks not ever being taken. As a
9137           * workaround, break the loop when the loop mask is empty instead of
9138           * always continuing. */
9139          ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
9140          unsigned block_idx = ctx->block->index;
9141
9142          /* create helper blocks to avoid critical edges */
9143          Block *break_block = ctx->program->create_and_insert_block();
9144          break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9145          break_block->kind = block_kind_uniform;
9146          bld.reset(break_block);
9147          bld.branch(aco_opcode::p_branch);
9148          add_linear_edge(block_idx, break_block);
9149          add_linear_edge(break_block->index, &loop_exit);
9150
9151          Block *continue_block = ctx->program->create_and_insert_block();
9152          continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9153          continue_block->kind = block_kind_uniform;
9154          bld.reset(continue_block);
9155          bld.branch(aco_opcode::p_branch);
9156          add_linear_edge(block_idx, continue_block);
9157          add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
9158
9159          if (!ctx->cf_info.parent_loop.has_divergent_branch)
9160             add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
9161          ctx->block = &ctx->program->blocks[block_idx];
9162       } else {
9163          ctx->block->kind |= (block_kind_continue | block_kind_uniform);
9164          if (!ctx->cf_info.parent_loop.has_divergent_branch)
9165             add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9166          else
9167             add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9168       }
9169
9170       bld.reset(ctx->block);
9171       bld.branch(aco_opcode::p_branch);
9172    }
9173
9174    /* Fixup phis in loop header from unreachable blocks.
9175     * has_branch/has_divergent_branch also indicates if the loop ends with a
9176     * break/continue instruction, but we don't emit those if unreachable=true */
9177    if (unreachable) {
9178       assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
9179       bool linear = ctx->cf_info.has_branch;
9180       bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
9181       for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
9182          if ((logical && instr->opcode == aco_opcode::p_phi) ||
9183              (linear && instr->opcode == aco_opcode::p_linear_phi)) {
9184             /* the last operand should be the one that needs to be removed */
9185             instr->operands.pop_back();
9186          } else if (!is_phi(instr)) {
9187             break;
9188          }
9189       }
9190    }
9191
9192    /* Fixup linear phis in loop header from expecting a continue. Both this fixup
9193     * and the previous one shouldn't both happen at once because a break in the
9194     * merge block would get CSE'd */
9195    if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
9196       unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
9197       Operand vals[num_vals];
9198       for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
9199          if (instr->opcode == aco_opcode::p_linear_phi) {
9200             if (ctx->cf_info.has_branch)
9201                instr->operands.pop_back();
9202             else
9203                instr->operands.back() = create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
9204          } else if (!is_phi(instr)) {
9205             break;
9206          }
9207       }
9208    }
9209
9210    ctx->cf_info.has_branch = false;
9211
9212    // TODO: if the loop has not a single exit, we must add one °°
9213    /* emit loop successor block */
9214    ctx->block = ctx->program->insert_block(std::move(loop_exit));
9215    append_logical_start(ctx->block);
9216
9217    #if 0
9218    // TODO: check if it is beneficial to not branch on continues
9219    /* trim linear phis in loop header */
9220    for (auto&& instr : loop_entry->instructions) {
9221       if (instr->opcode == aco_opcode::p_linear_phi) {
9222          aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
9223          new_phi->definitions[0] = instr->definitions[0];
9224          for (unsigned i = 0; i < new_phi->operands.size(); i++)
9225             new_phi->operands[i] = instr->operands[i];
9226          /* check that the remaining operands are all the same */
9227          for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
9228             assert(instr->operands[i].tempId() == instr->operands.back().tempId());
9229          instr.swap(new_phi);
9230       } else if (instr->opcode == aco_opcode::p_phi) {
9231          continue;
9232       } else {
9233          break;
9234       }
9235    }
9236    #endif
9237 }
9238
9239 static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond)
9240 {
9241    ic->cond = cond;
9242
9243    append_logical_end(ctx->block);
9244    ctx->block->kind |= block_kind_branch;
9245
9246    /* branch to linear then block */
9247    assert(cond.regClass() == ctx->program->lane_mask);
9248    aco_ptr<Pseudo_branch_instruction> branch;
9249    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
9250    branch->operands[0] = Operand(cond);
9251    ctx->block->instructions.push_back(std::move(branch));
9252
9253    ic->BB_if_idx = ctx->block->index;
9254    ic->BB_invert = Block();
9255    ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9256    /* Invert blocks are intentionally not marked as top level because they
9257     * are not part of the logical cfg. */
9258    ic->BB_invert.kind |= block_kind_invert;
9259    ic->BB_endif = Block();
9260    ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9261    ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
9262
9263    ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
9264    ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
9265    ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
9266    ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
9267    ctx->cf_info.parent_if.is_divergent = true;
9268
9269    /* divergent branches use cbranch_execz */
9270    ctx->cf_info.exec_potentially_empty_discard = false;
9271    ctx->cf_info.exec_potentially_empty_break = false;
9272    ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9273
9274    /** emit logical then block */
9275    Block* BB_then_logical = ctx->program->create_and_insert_block();
9276    BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9277    add_edge(ic->BB_if_idx, BB_then_logical);
9278    ctx->block = BB_then_logical;
9279    append_logical_start(BB_then_logical);
9280 }
9281
9282 static void begin_divergent_if_else(isel_context *ctx, if_context *ic)
9283 {
9284    Block *BB_then_logical = ctx->block;
9285    append_logical_end(BB_then_logical);
9286     /* branch from logical then block to invert block */
9287    aco_ptr<Pseudo_branch_instruction> branch;
9288    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9289    BB_then_logical->instructions.emplace_back(std::move(branch));
9290    add_linear_edge(BB_then_logical->index, &ic->BB_invert);
9291    if (!ctx->cf_info.parent_loop.has_divergent_branch)
9292       add_logical_edge(BB_then_logical->index, &ic->BB_endif);
9293    BB_then_logical->kind |= block_kind_uniform;
9294    assert(!ctx->cf_info.has_branch);
9295    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
9296    ctx->cf_info.parent_loop.has_divergent_branch = false;
9297
9298    /** emit linear then block */
9299    Block* BB_then_linear = ctx->program->create_and_insert_block();
9300    BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9301    BB_then_linear->kind |= block_kind_uniform;
9302    add_linear_edge(ic->BB_if_idx, BB_then_linear);
9303    /* branch from linear then block to invert block */
9304    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9305    BB_then_linear->instructions.emplace_back(std::move(branch));
9306    add_linear_edge(BB_then_linear->index, &ic->BB_invert);
9307
9308    /** emit invert merge block */
9309    ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
9310    ic->invert_idx = ctx->block->index;
9311
9312    /* branch to linear else block (skip else) */
9313    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0));
9314    branch->operands[0] = Operand(ic->cond);
9315    ctx->block->instructions.push_back(std::move(branch));
9316
9317    ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
9318    ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
9319    ic->exec_potentially_empty_break_depth_old =
9320       std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
9321    /* divergent branches use cbranch_execz */
9322    ctx->cf_info.exec_potentially_empty_discard = false;
9323    ctx->cf_info.exec_potentially_empty_break = false;
9324    ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9325
9326    /** emit logical else block */
9327    Block* BB_else_logical = ctx->program->create_and_insert_block();
9328    BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9329    add_logical_edge(ic->BB_if_idx, BB_else_logical);
9330    add_linear_edge(ic->invert_idx, BB_else_logical);
9331    ctx->block = BB_else_logical;
9332    append_logical_start(BB_else_logical);
9333 }
9334
9335 static void end_divergent_if(isel_context *ctx, if_context *ic)
9336 {
9337    Block *BB_else_logical = ctx->block;
9338    append_logical_end(BB_else_logical);
9339
9340    /* branch from logical else block to endif block */
9341    aco_ptr<Pseudo_branch_instruction> branch;
9342    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9343    BB_else_logical->instructions.emplace_back(std::move(branch));
9344    add_linear_edge(BB_else_logical->index, &ic->BB_endif);
9345    if (!ctx->cf_info.parent_loop.has_divergent_branch)
9346       add_logical_edge(BB_else_logical->index, &ic->BB_endif);
9347    BB_else_logical->kind |= block_kind_uniform;
9348
9349    assert(!ctx->cf_info.has_branch);
9350    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
9351
9352
9353    /** emit linear else block */
9354    Block* BB_else_linear = ctx->program->create_and_insert_block();
9355    BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9356    BB_else_linear->kind |= block_kind_uniform;
9357    add_linear_edge(ic->invert_idx, BB_else_linear);
9358
9359    /* branch from linear else block to endif block */
9360    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9361    BB_else_linear->instructions.emplace_back(std::move(branch));
9362    add_linear_edge(BB_else_linear->index, &ic->BB_endif);
9363
9364
9365    /** emit endif merge block */
9366    ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
9367    append_logical_start(ctx->block);
9368
9369
9370    ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
9371    ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
9372    ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
9373    ctx->cf_info.exec_potentially_empty_break_depth =
9374       std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
9375    if (ctx->cf_info.loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
9376        !ctx->cf_info.parent_if.is_divergent) {
9377       ctx->cf_info.exec_potentially_empty_break = false;
9378       ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9379    }
9380    /* uniform control flow never has an empty exec-mask */
9381    if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
9382       ctx->cf_info.exec_potentially_empty_discard = false;
9383       ctx->cf_info.exec_potentially_empty_break = false;
9384       ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9385    }
9386 }
9387
9388 static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond)
9389 {
9390    assert(cond.regClass() == s1);
9391
9392    append_logical_end(ctx->block);
9393    ctx->block->kind |= block_kind_uniform;
9394
9395    aco_ptr<Pseudo_branch_instruction> branch;
9396    aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
9397    branch.reset(create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 0));
9398    branch->operands[0] = Operand(cond);
9399    branch->operands[0].setFixed(scc);
9400    ctx->block->instructions.emplace_back(std::move(branch));
9401
9402    ic->BB_if_idx = ctx->block->index;
9403    ic->BB_endif = Block();
9404    ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9405    ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
9406
9407    ctx->cf_info.has_branch = false;
9408    ctx->cf_info.parent_loop.has_divergent_branch = false;
9409
9410    /** emit then block */
9411    Block* BB_then = ctx->program->create_and_insert_block();
9412    BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9413    add_edge(ic->BB_if_idx, BB_then);
9414    append_logical_start(BB_then);
9415    ctx->block = BB_then;
9416 }
9417
9418 static void begin_uniform_if_else(isel_context *ctx, if_context *ic)
9419 {
9420    Block *BB_then = ctx->block;
9421
9422    ic->uniform_has_then_branch = ctx->cf_info.has_branch;
9423    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
9424
9425    if (!ic->uniform_has_then_branch) {
9426       append_logical_end(BB_then);
9427       /* branch from then block to endif block */
9428       aco_ptr<Pseudo_branch_instruction> branch;
9429       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9430       BB_then->instructions.emplace_back(std::move(branch));
9431       add_linear_edge(BB_then->index, &ic->BB_endif);
9432       if (!ic->then_branch_divergent)
9433          add_logical_edge(BB_then->index, &ic->BB_endif);
9434       BB_then->kind |= block_kind_uniform;
9435    }
9436
9437    ctx->cf_info.has_branch = false;
9438    ctx->cf_info.parent_loop.has_divergent_branch = false;
9439
9440    /** emit else block */
9441    Block* BB_else = ctx->program->create_and_insert_block();
9442    BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9443    add_edge(ic->BB_if_idx, BB_else);
9444    append_logical_start(BB_else);
9445    ctx->block = BB_else;
9446 }
9447
9448 static void end_uniform_if(isel_context *ctx, if_context *ic)
9449 {
9450    Block *BB_else = ctx->block;
9451
9452    if (!ctx->cf_info.has_branch) {
9453       append_logical_end(BB_else);
9454       /* branch from then block to endif block */
9455       aco_ptr<Pseudo_branch_instruction> branch;
9456       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9457       BB_else->instructions.emplace_back(std::move(branch));
9458       add_linear_edge(BB_else->index, &ic->BB_endif);
9459       if (!ctx->cf_info.parent_loop.has_divergent_branch)
9460          add_logical_edge(BB_else->index, &ic->BB_endif);
9461       BB_else->kind |= block_kind_uniform;
9462    }
9463
9464    ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
9465    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
9466
9467    /** emit endif merge block */
9468    if (!ctx->cf_info.has_branch) {
9469       ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
9470       append_logical_start(ctx->block);
9471    }
9472 }
9473
9474 static bool visit_if(isel_context *ctx, nir_if *if_stmt)
9475 {
9476    Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
9477    Builder bld(ctx->program, ctx->block);
9478    aco_ptr<Pseudo_branch_instruction> branch;
9479    if_context ic;
9480
9481    if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
9482       /**
9483        * Uniform conditionals are represented in the following way*) :
9484        *
9485        * The linear and logical CFG:
9486        *                        BB_IF
9487        *                        /    \
9488        *       BB_THEN (logical)      BB_ELSE (logical)
9489        *                        \    /
9490        *                        BB_ENDIF
9491        *
9492        * *) Exceptions may be due to break and continue statements within loops
9493        *    If a break/continue happens within uniform control flow, it branches
9494        *    to the loop exit/entry block. Otherwise, it branches to the next
9495        *    merge block.
9496        **/
9497
9498       // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
9499       assert(cond.regClass() == ctx->program->lane_mask);
9500       cond = bool_to_scalar_condition(ctx, cond);
9501
9502       begin_uniform_if_then(ctx, &ic, cond);
9503       visit_cf_list(ctx, &if_stmt->then_list);
9504
9505       begin_uniform_if_else(ctx, &ic);
9506       visit_cf_list(ctx, &if_stmt->else_list);
9507
9508       end_uniform_if(ctx, &ic);
9509
9510       return !ctx->cf_info.has_branch;
9511    } else { /* non-uniform condition */
9512       /**
9513        * To maintain a logical and linear CFG without critical edges,
9514        * non-uniform conditionals are represented in the following way*) :
9515        *
9516        * The linear CFG:
9517        *                        BB_IF
9518        *                        /    \
9519        *       BB_THEN (logical)      BB_THEN (linear)
9520        *                        \    /
9521        *                        BB_INVERT (linear)
9522        *                        /    \
9523        *       BB_ELSE (logical)      BB_ELSE (linear)
9524        *                        \    /
9525        *                        BB_ENDIF
9526        *
9527        * The logical CFG:
9528        *                        BB_IF
9529        *                        /    \
9530        *       BB_THEN (logical)      BB_ELSE (logical)
9531        *                        \    /
9532        *                        BB_ENDIF
9533        *
9534        * *) Exceptions may be due to break and continue statements within loops
9535        **/
9536
9537       begin_divergent_if_then(ctx, &ic, cond);
9538       visit_cf_list(ctx, &if_stmt->then_list);
9539
9540       begin_divergent_if_else(ctx, &ic);
9541       visit_cf_list(ctx, &if_stmt->else_list);
9542
9543       end_divergent_if(ctx, &ic);
9544
9545       return true;
9546    }
9547 }
9548
9549 static bool visit_cf_list(isel_context *ctx,
9550                           struct exec_list *list)
9551 {
9552    foreach_list_typed(nir_cf_node, node, node, list) {
9553       switch (node->type) {
9554       case nir_cf_node_block:
9555          visit_block(ctx, nir_cf_node_as_block(node));
9556          break;
9557       case nir_cf_node_if:
9558          if (!visit_if(ctx, nir_cf_node_as_if(node)))
9559             return true;
9560          break;
9561       case nir_cf_node_loop:
9562          visit_loop(ctx, nir_cf_node_as_loop(node));
9563          break;
9564       default:
9565          unreachable("unimplemented cf list type");
9566       }
9567    }
9568    return false;
9569 }
9570
9571 static void create_null_export(isel_context *ctx)
9572 {
9573    /* Some shader stages always need to have exports.
9574     * So when there is none, we need to add a null export.
9575     */
9576
9577    unsigned dest = (ctx->program->stage & hw_fs) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
9578    bool vm = (ctx->program->stage & hw_fs) || ctx->program->chip_class >= GFX10;
9579    Builder bld(ctx->program, ctx->block);
9580    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
9581            /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, vm);
9582 }
9583
9584 static bool export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
9585 {
9586    assert(ctx->stage == vertex_vs ||
9587           ctx->stage == tess_eval_vs ||
9588           ctx->stage == gs_copy_vs ||
9589           ctx->stage == ngg_vertex_gs ||
9590           ctx->stage == ngg_tess_eval_gs);
9591
9592    int offset = (ctx->stage & sw_tes)
9593                 ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
9594                 : ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
9595    uint64_t mask = ctx->outputs.mask[slot];
9596    if (!is_pos && !mask)
9597       return false;
9598    if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
9599       return false;
9600    aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
9601    exp->enabled_mask = mask;
9602    for (unsigned i = 0; i < 4; ++i) {
9603       if (mask & (1 << i))
9604          exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
9605       else
9606          exp->operands[i] = Operand(v1);
9607    }
9608    /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
9609     * Setting valid_mask=1 prevents it and has no other effect.
9610     */
9611    exp->valid_mask = ctx->options->chip_class >= GFX10 && is_pos && *next_pos == 0;
9612    exp->done = false;
9613    exp->compressed = false;
9614    if (is_pos)
9615       exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
9616    else
9617       exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
9618    ctx->block->instructions.emplace_back(std::move(exp));
9619
9620    return true;
9621 }
9622
9623 static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
9624 {
9625    aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
9626    exp->enabled_mask = 0;
9627    for (unsigned i = 0; i < 4; ++i)
9628       exp->operands[i] = Operand(v1);
9629    if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {
9630       exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]);
9631       exp->enabled_mask |= 0x1;
9632    }
9633    if (ctx->outputs.mask[VARYING_SLOT_LAYER]) {
9634       exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]);
9635       exp->enabled_mask |= 0x4;
9636    }
9637    if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) {
9638       if (ctx->options->chip_class < GFX9) {
9639          exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]);
9640          exp->enabled_mask |= 0x8;
9641       } else {
9642          Builder bld(ctx->program, ctx->block);
9643
9644          Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
9645                              Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]));
9646          if (exp->operands[2].isTemp())
9647             out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
9648
9649          exp->operands[2] = Operand(out);
9650          exp->enabled_mask |= 0x4;
9651       }
9652    }
9653    exp->valid_mask = ctx->options->chip_class >= GFX10 && *next_pos == 0;
9654    exp->done = false;
9655    exp->compressed = false;
9656    exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
9657    ctx->block->instructions.emplace_back(std::move(exp));
9658 }
9659
9660 static void create_export_phis(isel_context *ctx)
9661 {
9662    /* Used when exports are needed, but the output temps are defined in a preceding block.
9663     * This function will set up phis in order to access the outputs in the next block.
9664     */
9665
9666    assert(ctx->block->instructions.back()->opcode == aco_opcode::p_logical_start);
9667    aco_ptr<Instruction> logical_start = aco_ptr<Instruction>(ctx->block->instructions.back().release());
9668    ctx->block->instructions.pop_back();
9669
9670    Builder bld(ctx->program, ctx->block);
9671
9672    for (unsigned slot = 0; slot <= VARYING_SLOT_VAR31; ++slot) {
9673       uint64_t mask = ctx->outputs.mask[slot];
9674       for (unsigned i = 0; i < 4; ++i) {
9675          if (!(mask & (1 << i)))
9676             continue;
9677
9678          Temp old = ctx->outputs.temps[slot * 4 + i];
9679          Temp phi = bld.pseudo(aco_opcode::p_phi, bld.def(v1), old, Operand(v1));
9680          ctx->outputs.temps[slot * 4 + i] = phi;
9681       }
9682    }
9683
9684    bld.insert(std::move(logical_start));
9685 }
9686
9687 static void create_vs_exports(isel_context *ctx)
9688 {
9689    assert(ctx->stage == vertex_vs ||
9690           ctx->stage == tess_eval_vs ||
9691           ctx->stage == gs_copy_vs ||
9692           ctx->stage == ngg_vertex_gs ||
9693           ctx->stage == ngg_tess_eval_gs);
9694
9695    radv_vs_output_info *outinfo = (ctx->stage & sw_tes)
9696                                   ? &ctx->program->info->tes.outinfo
9697                                   : &ctx->program->info->vs.outinfo;
9698
9699    if (outinfo->export_prim_id && !(ctx->stage & hw_ngg_gs)) {
9700       ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
9701       ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->vs_prim_id);
9702    }
9703
9704    if (ctx->options->key.has_multiview_view_index) {
9705       ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1;
9706       ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
9707    }
9708
9709    /* the order these position exports are created is important */
9710    int next_pos = 0;
9711    bool exported_pos = export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
9712    if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
9713       export_vs_psiz_layer_viewport(ctx, &next_pos);
9714       exported_pos = true;
9715    }
9716    if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
9717       exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
9718    if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
9719       exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
9720
9721    if (ctx->export_clip_dists) {
9722       if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
9723          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
9724       if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
9725          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
9726    }
9727
9728    for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
9729       if (i < VARYING_SLOT_VAR0 &&
9730           i != VARYING_SLOT_LAYER &&
9731           i != VARYING_SLOT_PRIMITIVE_ID)
9732          continue;
9733
9734       export_vs_varying(ctx, i, false, NULL);
9735    }
9736
9737    if (!exported_pos)
9738       create_null_export(ctx);
9739 }
9740
9741 static bool export_fs_mrt_z(isel_context *ctx)
9742 {
9743    Builder bld(ctx->program, ctx->block);
9744    unsigned enabled_channels = 0;
9745    bool compr = false;
9746    Operand values[4];
9747
9748    for (unsigned i = 0; i < 4; ++i) {
9749       values[i] = Operand(v1);
9750    }
9751
9752    /* Both stencil and sample mask only need 16-bits. */
9753    if (!ctx->program->info->ps.writes_z &&
9754        (ctx->program->info->ps.writes_stencil ||
9755         ctx->program->info->ps.writes_sample_mask)) {
9756       compr = true; /* COMPR flag */
9757
9758       if (ctx->program->info->ps.writes_stencil) {
9759          /* Stencil should be in X[23:16]. */
9760          values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
9761          values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), values[0]);
9762          enabled_channels |= 0x3;
9763       }
9764
9765       if (ctx->program->info->ps.writes_sample_mask) {
9766          /* SampleMask should be in Y[15:0]. */
9767          values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
9768          enabled_channels |= 0xc;
9769      }
9770    } else {
9771       if (ctx->program->info->ps.writes_z) {
9772          values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]);
9773          enabled_channels |= 0x1;
9774       }
9775
9776       if (ctx->program->info->ps.writes_stencil) {
9777          values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
9778          enabled_channels |= 0x2;
9779       }
9780
9781       if (ctx->program->info->ps.writes_sample_mask) {
9782          values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
9783          enabled_channels |= 0x4;
9784       }
9785    }
9786
9787    /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X
9788     * writemask component.
9789     */
9790    if (ctx->options->chip_class == GFX6 &&
9791        ctx->options->family != CHIP_OLAND &&
9792        ctx->options->family != CHIP_HAINAN) {
9793             enabled_channels |= 0x1;
9794    }
9795
9796    bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
9797            enabled_channels, V_008DFC_SQ_EXP_MRTZ, compr);
9798
9799    return true;
9800 }
9801
9802 static bool export_fs_mrt_color(isel_context *ctx, int slot)
9803 {
9804    Builder bld(ctx->program, ctx->block);
9805    unsigned write_mask = ctx->outputs.mask[slot];
9806    Operand values[4];
9807
9808    for (unsigned i = 0; i < 4; ++i) {
9809       if (write_mask & (1 << i)) {
9810          values[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
9811       } else {
9812          values[i] = Operand(v1);
9813       }
9814    }
9815
9816    unsigned target, col_format;
9817    unsigned enabled_channels = 0;
9818    aco_opcode compr_op = (aco_opcode)0;
9819
9820    slot -= FRAG_RESULT_DATA0;
9821    target = V_008DFC_SQ_EXP_MRT + slot;
9822    col_format = (ctx->options->key.fs.col_format >> (4 * slot)) & 0xf;
9823
9824    bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1;
9825    bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1;
9826
9827    switch (col_format)
9828    {
9829    case V_028714_SPI_SHADER_ZERO:
9830       enabled_channels = 0; /* writemask */
9831       target = V_008DFC_SQ_EXP_NULL;
9832       break;
9833
9834    case V_028714_SPI_SHADER_32_R:
9835       enabled_channels = 1;
9836       break;
9837
9838    case V_028714_SPI_SHADER_32_GR:
9839       enabled_channels = 0x3;
9840       break;
9841
9842    case V_028714_SPI_SHADER_32_AR:
9843       if (ctx->options->chip_class >= GFX10) {
9844          /* Special case: on GFX10, the outputs are different for 32_AR */
9845          enabled_channels = 0x3;
9846          values[1] = values[3];
9847          values[3] = Operand(v1);
9848       } else {
9849          enabled_channels = 0x9;
9850       }
9851       break;
9852
9853    case V_028714_SPI_SHADER_FP16_ABGR:
9854       enabled_channels = 0x5;
9855       compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
9856       break;
9857
9858    case V_028714_SPI_SHADER_UNORM16_ABGR:
9859       enabled_channels = 0x5;
9860       compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
9861       break;
9862
9863    case V_028714_SPI_SHADER_SNORM16_ABGR:
9864       enabled_channels = 0x5;
9865       compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
9866       break;
9867
9868    case V_028714_SPI_SHADER_UINT16_ABGR: {
9869       enabled_channels = 0x5;
9870       compr_op = aco_opcode::v_cvt_pk_u16_u32;
9871       if (is_int8 || is_int10) {
9872          /* clamp */
9873          uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
9874          Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
9875
9876          for (unsigned i = 0; i < 4; i++) {
9877             if ((write_mask >> i) & 1) {
9878                values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
9879                                     i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val),
9880                                     values[i]);
9881             }
9882          }
9883       }
9884       break;
9885    }
9886
9887    case V_028714_SPI_SHADER_SINT16_ABGR:
9888       enabled_channels = 0x5;
9889       compr_op = aco_opcode::v_cvt_pk_i16_i32;
9890       if (is_int8 || is_int10) {
9891          /* clamp */
9892          uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
9893          uint32_t min_rgb = is_int8 ? -128 :is_int10 ? -512 : 0;
9894          Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
9895          Temp min_rgb_val = bld.copy(bld.def(s1), Operand(min_rgb));
9896
9897          for (unsigned i = 0; i < 4; i++) {
9898             if ((write_mask >> i) & 1) {
9899                values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
9900                                     i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val),
9901                                     values[i]);
9902                values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
9903                                     i == 3 && is_int10 ? Operand(-2u) : Operand(min_rgb_val),
9904                                     values[i]);
9905             }
9906          }
9907       }
9908       break;
9909
9910    case V_028714_SPI_SHADER_32_ABGR:
9911       enabled_channels = 0xF;
9912       break;
9913
9914    default:
9915       break;
9916    }
9917
9918    if (target == V_008DFC_SQ_EXP_NULL)
9919       return false;
9920
9921    if ((bool) compr_op) {
9922       for (int i = 0; i < 2; i++) {
9923          /* check if at least one of the values to be compressed is enabled */
9924          unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
9925          if (enabled) {
9926             enabled_channels |= enabled << (i*2);
9927             values[i] = bld.vop3(compr_op, bld.def(v1),
9928                                  values[i*2].isUndefined() ? Operand(0u) : values[i*2],
9929                                  values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]);
9930          } else {
9931             values[i] = Operand(v1);
9932          }
9933       }
9934       values[2] = Operand(v1);
9935       values[3] = Operand(v1);
9936    } else {
9937       for (int i = 0; i < 4; i++)
9938          values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
9939    }
9940
9941    bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
9942            enabled_channels, target, (bool) compr_op);
9943    return true;
9944 }
9945
9946 static void create_fs_exports(isel_context *ctx)
9947 {
9948    bool exported = false;
9949
9950    /* Export depth, stencil and sample mask. */
9951    if (ctx->outputs.mask[FRAG_RESULT_DEPTH] ||
9952        ctx->outputs.mask[FRAG_RESULT_STENCIL] ||
9953        ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
9954       exported |= export_fs_mrt_z(ctx);
9955
9956    /* Export all color render targets. */
9957    for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i)
9958       if (ctx->outputs.mask[i])
9959          exported |= export_fs_mrt_color(ctx, i);
9960
9961    if (!exported)
9962       create_null_export(ctx);
9963 }
9964
9965 static void write_tcs_tess_factors(isel_context *ctx)
9966 {
9967    unsigned outer_comps;
9968    unsigned inner_comps;
9969
9970    switch (ctx->args->options->key.tcs.primitive_mode) {
9971    case GL_ISOLINES:
9972       outer_comps = 2;
9973       inner_comps = 0;
9974       break;
9975    case GL_TRIANGLES:
9976       outer_comps = 3;
9977       inner_comps = 1;
9978       break;
9979    case GL_QUADS:
9980       outer_comps = 4;
9981       inner_comps = 2;
9982       break;
9983    default:
9984       return;
9985    }
9986
9987    Builder bld(ctx->program, ctx->block);
9988
9989    bld.barrier(aco_opcode::p_memory_barrier_shared);
9990    if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
9991       bld.sopp(aco_opcode::s_barrier);
9992
9993    Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
9994    Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u));
9995
9996    Temp invocation_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), invocation_id);
9997    if_context ic_invocation_id_is_zero;
9998    begin_divergent_if_then(ctx, &ic_invocation_id_is_zero, invocation_id_is_zero);
9999    bld.reset(ctx->block);
10000
10001    Temp hs_ring_tess_factor = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_FACTOR * 16u));
10002
10003    std::pair<Temp, unsigned> lds_base = get_tcs_output_lds_offset(ctx);
10004    unsigned stride = inner_comps + outer_comps;
10005    unsigned lds_align = calculate_lds_alignment(ctx, lds_base.second);
10006    Temp tf_inner_vec;
10007    Temp tf_outer_vec;
10008    Temp out[6];
10009    assert(stride <= (sizeof(out) / sizeof(Temp)));
10010
10011    if (ctx->args->options->key.tcs.primitive_mode == GL_ISOLINES) {
10012       // LINES reversal
10013       tf_outer_vec = load_lds(ctx, 4, bld.tmp(v2), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_out_loc, lds_align);
10014       out[1] = emit_extract_vector(ctx, tf_outer_vec, 0, v1);
10015       out[0] = emit_extract_vector(ctx, tf_outer_vec, 1, v1);
10016    } else {
10017       tf_outer_vec = load_lds(ctx, 4, bld.tmp(RegClass(RegType::vgpr, outer_comps)), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_out_loc, lds_align);
10018       tf_inner_vec = load_lds(ctx, 4, bld.tmp(RegClass(RegType::vgpr, inner_comps)), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_in_loc, lds_align);
10019
10020       for (unsigned i = 0; i < outer_comps; ++i)
10021          out[i] = emit_extract_vector(ctx, tf_outer_vec, i, v1);
10022       for (unsigned i = 0; i < inner_comps; ++i)
10023          out[outer_comps + i] = emit_extract_vector(ctx, tf_inner_vec, i, v1);
10024    }
10025
10026    Temp rel_patch_id = get_tess_rel_patch_id(ctx);
10027    Temp tf_base = get_arg(ctx, ctx->args->tess_factor_offset);
10028    Temp byte_offset = bld.v_mul_imm(bld.def(v1), rel_patch_id, stride * 4u);
10029    unsigned tf_const_offset = 0;
10030
10031    if (ctx->program->chip_class <= GFX8) {
10032       Temp rel_patch_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), rel_patch_id);
10033       if_context ic_rel_patch_id_is_zero;
10034       begin_divergent_if_then(ctx, &ic_rel_patch_id_is_zero, rel_patch_id_is_zero);
10035       bld.reset(ctx->block);
10036
10037       /* Store the dynamic HS control word. */
10038       Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
10039       bld.mubuf(aco_opcode::buffer_store_dword,
10040                 /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
10041                 /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false,
10042                 /* disable_wqm */ false, /* glc */ true);
10043       tf_const_offset += 4;
10044
10045       begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);
10046       end_divergent_if(ctx, &ic_rel_patch_id_is_zero);
10047       bld.reset(ctx->block);
10048    }
10049
10050    assert(stride == 2 || stride == 4 || stride == 6);
10051    Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr, 4u);
10052    store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false);
10053
10054    /* Store to offchip for TES to read - only if TES reads them */
10055    if (ctx->args->options->key.tcs.tes_reads_tess_factors) {
10056       Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
10057       Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
10058
10059       std::pair<Temp, unsigned> vmem_offs_outer = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_out_loc);
10060       store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, false);
10061
10062       if (likely(inner_comps)) {
10063          std::pair<Temp, unsigned> vmem_offs_inner = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_in_loc);
10064          store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, false);
10065       }
10066    }
10067
10068    begin_divergent_if_else(ctx, &ic_invocation_id_is_zero);
10069    end_divergent_if(ctx, &ic_invocation_id_is_zero);
10070 }
10071
10072 static void emit_stream_output(isel_context *ctx,
10073                                Temp const *so_buffers,
10074                                Temp const *so_write_offset,
10075                                const struct radv_stream_output *output)
10076 {
10077    unsigned num_comps = util_bitcount(output->component_mask);
10078    unsigned writemask = (1 << num_comps) - 1;
10079    unsigned loc = output->location;
10080    unsigned buf = output->buffer;
10081
10082    assert(num_comps && num_comps <= 4);
10083    if (!num_comps || num_comps > 4)
10084       return;
10085
10086    unsigned start = ffs(output->component_mask) - 1;
10087
10088    Temp out[4];
10089    bool all_undef = true;
10090    assert(ctx->stage == vertex_vs || ctx->stage == gs_copy_vs);
10091    for (unsigned i = 0; i < num_comps; i++) {
10092       out[i] = ctx->outputs.temps[loc * 4 + start + i];
10093       all_undef = all_undef && !out[i].id();
10094    }
10095    if (all_undef)
10096       return;
10097
10098    while (writemask) {
10099       int start, count;
10100       u_bit_scan_consecutive_range(&writemask, &start, &count);
10101       if (count == 3 && ctx->options->chip_class == GFX6) {
10102          /* GFX6 doesn't support storing vec3, split it. */
10103          writemask |= 1u << (start + 2);
10104          count = 2;
10105       }
10106
10107       unsigned offset = output->offset + start * 4;
10108
10109       Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, count)};
10110       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
10111       for (int i = 0; i < count; ++i)
10112          vec->operands[i] = (ctx->outputs.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand(0u);
10113       vec->definitions[0] = Definition(write_data);
10114       ctx->block->instructions.emplace_back(std::move(vec));
10115
10116       aco_opcode opcode;
10117       switch (count) {
10118       case 1:
10119          opcode = aco_opcode::buffer_store_dword;
10120          break;
10121       case 2:
10122          opcode = aco_opcode::buffer_store_dwordx2;
10123          break;
10124       case 3:
10125          opcode = aco_opcode::buffer_store_dwordx3;
10126          break;
10127       case 4:
10128          opcode = aco_opcode::buffer_store_dwordx4;
10129          break;
10130       default:
10131          unreachable("Unsupported dword count.");
10132       }
10133
10134       aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
10135       store->operands[0] = Operand(so_buffers[buf]);
10136       store->operands[1] = Operand(so_write_offset[buf]);
10137       store->operands[2] = Operand((uint32_t) 0);
10138       store->operands[3] = Operand(write_data);
10139       if (offset > 4095) {
10140          /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */
10141          Builder bld(ctx->program, ctx->block);
10142          store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
10143       } else {
10144          store->offset = offset;
10145       }
10146       store->offen = true;
10147       store->glc = true;
10148       store->dlc = false;
10149       store->slc = true;
10150       store->can_reorder = true;
10151       ctx->block->instructions.emplace_back(std::move(store));
10152    }
10153 }
10154
10155 static void emit_streamout(isel_context *ctx, unsigned stream)
10156 {
10157    Builder bld(ctx->program, ctx->block);
10158
10159    Temp so_buffers[4];
10160    Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
10161    for (unsigned i = 0; i < 4; i++) {
10162       unsigned stride = ctx->program->info->so.strides[i];
10163       if (!stride)
10164          continue;
10165
10166       Operand off = bld.copy(bld.def(s1), Operand(i * 16u));
10167       so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, off);
10168    }
10169
10170    Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10171                                 get_arg(ctx, ctx->args->streamout_config), Operand(0x70010u));
10172
10173    Temp tid = emit_mbcnt(ctx, bld.def(v1));
10174
10175    Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
10176
10177    if_context ic;
10178    begin_divergent_if_then(ctx, &ic, can_emit);
10179
10180    bld.reset(ctx->block);
10181
10182    Temp so_write_index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->streamout_write_idx), tid);
10183
10184    Temp so_write_offset[4];
10185
10186    for (unsigned i = 0; i < 4; i++) {
10187       unsigned stride = ctx->program->info->so.strides[i];
10188       if (!stride)
10189          continue;
10190
10191       if (stride == 1) {
10192          Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
10193                                 get_arg(ctx, ctx->args->streamout_write_idx),
10194                                 get_arg(ctx, ctx->args->streamout_offset[i]));
10195          Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
10196
10197          so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
10198       } else {
10199          Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
10200          Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u),
10201                                  get_arg(ctx, ctx->args->streamout_offset[i]));
10202          so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
10203       }
10204    }
10205
10206    for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
10207       struct radv_stream_output *output =
10208          &ctx->program->info->so.outputs[i];
10209       if (stream != output->stream)
10210          continue;
10211
10212       emit_stream_output(ctx, so_buffers, so_write_offset, output);
10213    }
10214
10215    begin_divergent_if_else(ctx, &ic);
10216    end_divergent_if(ctx, &ic);
10217 }
10218
10219 } /* end namespace */
10220
10221 void fix_ls_vgpr_init_bug(isel_context *ctx, Pseudo_instruction *startpgm)
10222 {
10223    assert(ctx->shader->info.stage == MESA_SHADER_VERTEX);
10224    Builder bld(ctx->program, ctx->block);
10225    constexpr unsigned hs_idx = 1u;
10226    Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10227                                               get_arg(ctx, ctx->args->merged_wave_info),
10228                                               Operand((8u << 16) | (hs_idx * 8u)));
10229    Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
10230
10231    /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
10232
10233    Temp instance_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10234                                get_arg(ctx, ctx->args->rel_auto_id),
10235                                get_arg(ctx, ctx->args->ac.instance_id),
10236                                ls_has_nonzero_hs_threads);
10237    Temp rel_auto_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10238                                get_arg(ctx, ctx->args->ac.tcs_rel_ids),
10239                                get_arg(ctx, ctx->args->rel_auto_id),
10240                                ls_has_nonzero_hs_threads);
10241    Temp vertex_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10242                              get_arg(ctx, ctx->args->ac.tcs_patch_id),
10243                              get_arg(ctx, ctx->args->ac.vertex_id),
10244                              ls_has_nonzero_hs_threads);
10245
10246    ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id;
10247    ctx->arg_temps[ctx->args->rel_auto_id.arg_index] = rel_auto_id;
10248    ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id;
10249 }
10250
10251 void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm)
10252 {
10253    /* Split all arguments except for the first (ring_offsets) and the last
10254     * (exec) so that the dead channels don't stay live throughout the program.
10255     */
10256    for (int i = 1; i < startpgm->definitions.size() - 1; i++) {
10257       if (startpgm->definitions[i].regClass().size() > 1) {
10258          emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
10259                            startpgm->definitions[i].regClass().size());
10260       }
10261    }
10262 }
10263
10264 void handle_bc_optimize(isel_context *ctx)
10265 {
10266    /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
10267    Builder bld(ctx->program, ctx->block);
10268    uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
10269    bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
10270    bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
10271    ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
10272    ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
10273    if (uses_center && uses_centroid) {
10274       Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),
10275                               get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u));
10276
10277       if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
10278          Temp new_coord[2];
10279          for (unsigned i = 0; i < 2; i++) {
10280             Temp persp_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
10281             Temp persp_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
10282             new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10283                                     persp_centroid, persp_center, sel);
10284          }
10285          ctx->persp_centroid = bld.tmp(v2);
10286          bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
10287                     Operand(new_coord[0]), Operand(new_coord[1]));
10288          emit_split_vector(ctx, ctx->persp_centroid, 2);
10289       }
10290
10291       if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
10292          Temp new_coord[2];
10293          for (unsigned i = 0; i < 2; i++) {
10294             Temp linear_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
10295             Temp linear_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
10296             new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10297                                     linear_centroid, linear_center, sel);
10298          }
10299          ctx->linear_centroid = bld.tmp(v2);
10300          bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
10301                     Operand(new_coord[0]), Operand(new_coord[1]));
10302          emit_split_vector(ctx, ctx->linear_centroid, 2);
10303       }
10304    }
10305 }
10306
10307 void setup_fp_mode(isel_context *ctx, nir_shader *shader)
10308 {
10309    Program *program = ctx->program;
10310
10311    unsigned float_controls = shader->info.float_controls_execution_mode;
10312
10313    program->next_fp_mode.preserve_signed_zero_inf_nan32 =
10314       float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
10315    program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
10316       float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
10317                         FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
10318
10319    program->next_fp_mode.must_flush_denorms32 =
10320       float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
10321    program->next_fp_mode.must_flush_denorms16_64 =
10322       float_controls & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
10323                         FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
10324
10325    program->next_fp_mode.care_about_round32 =
10326       float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
10327
10328    program->next_fp_mode.care_about_round16_64 =
10329       float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
10330                         FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
10331
10332    /* default to preserving fp16 and fp64 denorms, since it's free */
10333    if (program->next_fp_mode.must_flush_denorms16_64)
10334       program->next_fp_mode.denorm16_64 = 0;
10335    else
10336       program->next_fp_mode.denorm16_64 = fp_denorm_keep;
10337
10338    /* preserving fp32 denorms is expensive, so only do it if asked */
10339    if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
10340       program->next_fp_mode.denorm32 = fp_denorm_keep;
10341    else
10342       program->next_fp_mode.denorm32 = 0;
10343
10344    if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
10345       program->next_fp_mode.round32 = fp_round_tz;
10346    else
10347       program->next_fp_mode.round32 = fp_round_ne;
10348
10349    if (float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
10350       program->next_fp_mode.round16_64 = fp_round_tz;
10351    else
10352       program->next_fp_mode.round16_64 = fp_round_ne;
10353
10354    ctx->block->fp_mode = program->next_fp_mode;
10355 }
10356
10357 void cleanup_cfg(Program *program)
10358 {
10359    /* create linear_succs/logical_succs */
10360    for (Block& BB : program->blocks) {
10361       for (unsigned idx : BB.linear_preds)
10362          program->blocks[idx].linear_succs.emplace_back(BB.index);
10363       for (unsigned idx : BB.logical_preds)
10364          program->blocks[idx].logical_succs.emplace_back(BB.index);
10365    }
10366 }
10367
10368 Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i)
10369 {
10370    Builder bld(ctx->program, ctx->block);
10371
10372    /* The s_bfm only cares about s0.u[5:0] so we don't need either s_bfe nor s_and here */
10373    Temp count = i == 0
10374                 ? get_arg(ctx, ctx->args->merged_wave_info)
10375                 : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
10376                            get_arg(ctx, ctx->args->merged_wave_info), Operand(i * 8u));
10377
10378    Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand(0u));
10379    Temp cond;
10380
10381    if (ctx->program->wave_size == 64) {
10382       /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
10383       Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count, Operand(6u /* log2(64) */));
10384       cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), mask, bld.scc(active_64));
10385    } else {
10386       /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of the register */
10387       cond = emit_extract_vector(ctx, mask, 0, bld.lm);
10388    }
10389
10390    return cond;
10391 }
10392
10393 bool ngg_early_prim_export(isel_context *ctx)
10394 {
10395    /* TODO: Check edge flags, and if they are written, return false. (Needed for OpenGL, not for Vulkan.) */
10396    return true;
10397 }
10398
10399 void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx)
10400 {
10401    Builder bld(ctx->program, ctx->block);
10402
10403    /* It is recommended to do the GS_ALLOC_REQ as soon and as quickly as possible, so we set the maximum priority (3). */
10404    bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
10405
10406    /* Get the id of the current wave within the threadgroup (workgroup) */
10407    Builder::Result wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10408                                             get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16)));
10409
10410    /* Execute the following code only on the first wave (wave id 0),
10411     * use the SCC def to tell if the wave id is zero or not.
10412     */
10413    Temp cond = wave_id_in_tg.def(1).getTemp();
10414    if_context ic;
10415    begin_uniform_if_then(ctx, &ic, cond);
10416    begin_uniform_if_else(ctx, &ic);
10417    bld.reset(ctx->block);
10418
10419    /* Number of vertices output by VS/TES */
10420    Temp vtx_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10421                            get_arg(ctx, ctx->args->gs_tg_info), Operand(12u | (9u << 16u)));
10422    /* Number of primitives output by VS/TES */
10423    Temp prm_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10424                            get_arg(ctx, ctx->args->gs_tg_info), Operand(22u | (9u << 16u)));
10425
10426    /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
10427    Temp tmp = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand(12u));
10428    tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);
10429
10430    /* Request the SPI to allocate space for the primitives and vertices that will be exported by the threadgroup. */
10431    bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);
10432
10433    /* After the GS_ALLOC_REQ is done, reset priority to default (0). */
10434    bld.sopp(aco_opcode::s_setprio, -1u, 0x0u);
10435
10436    end_uniform_if(ctx, &ic);
10437 }
10438
10439 Temp ngg_get_prim_exp_arg(isel_context *ctx, unsigned num_vertices, const Temp vtxindex[])
10440 {
10441    Builder bld(ctx->program, ctx->block);
10442
10443    if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) {
10444       return get_arg(ctx, ctx->args->gs_vtx_offset[0]);
10445    }
10446
10447    Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
10448    Temp tmp;
10449
10450    for (unsigned i = 0; i < num_vertices; ++i) {
10451       assert(vtxindex[i].id());
10452
10453       if (i)
10454          tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), vtxindex[i], Operand(10u * i), tmp);
10455       else
10456          tmp = vtxindex[i];
10457
10458       /* The initial edge flag is always false in tess eval shaders. */
10459       if (ctx->stage == ngg_vertex_gs) {
10460          Temp edgeflag = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), gs_invocation_id, Operand(8 + i), Operand(1u));
10461          tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), edgeflag, Operand(10u * i + 9u), tmp);
10462       }
10463    }
10464
10465    /* TODO: Set isnull field in case of merged NGG VS+GS. */
10466
10467    return tmp;
10468 }
10469
10470 void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive, const Temp vtxindex[])
10471 {
10472    Builder bld(ctx->program, ctx->block);
10473    Temp prim_exp_arg = ngg_get_prim_exp_arg(ctx, num_vertices_per_primitive, vtxindex);
10474
10475    bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
10476         1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */,
10477         false /* compressed */, true/* done */, false /* valid mask */);
10478 }
10479
10480 void ngg_emit_nogs_gsthreads(isel_context *ctx)
10481 {
10482    /* Emit the things that NGG GS threads need to do, for shaders that don't have SW GS.
10483     * These must always come before VS exports.
10484     *
10485     * It is recommended to do these as early as possible. They can be at the beginning when
10486     * there is no SW GS and the shader doesn't write edge flags.
10487     */
10488
10489    if_context ic;
10490    Temp is_gs_thread = merged_wave_info_to_mask(ctx, 1);
10491    begin_divergent_if_then(ctx, &ic, is_gs_thread);
10492
10493    Builder bld(ctx->program, ctx->block);
10494    constexpr unsigned max_vertices_per_primitive = 3;
10495    unsigned num_vertices_per_primitive = max_vertices_per_primitive;
10496
10497    if (ctx->stage == ngg_vertex_gs) {
10498       /* TODO: optimize for points & lines */
10499    } else if (ctx->stage == ngg_tess_eval_gs) {
10500       if (ctx->shader->info.tess.point_mode)
10501          num_vertices_per_primitive = 1;
10502       else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)
10503          num_vertices_per_primitive = 2;
10504    } else {
10505       unreachable("Unsupported NGG shader stage");
10506    }
10507
10508    Temp vtxindex[max_vertices_per_primitive];
10509    vtxindex[0] = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
10510                           get_arg(ctx, ctx->args->gs_vtx_offset[0]));
10511    vtxindex[1] = num_vertices_per_primitive < 2 ? Temp(0, v1) :
10512                  bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
10513                           get_arg(ctx, ctx->args->gs_vtx_offset[0]), Operand(16u), Operand(16u));
10514    vtxindex[2] = num_vertices_per_primitive < 3 ? Temp(0, v1) :
10515                  bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
10516                           get_arg(ctx, ctx->args->gs_vtx_offset[2]));
10517
10518    /* Export primitive data to the index buffer. */
10519    ngg_emit_prim_export(ctx, num_vertices_per_primitive, vtxindex);
10520
10521    /* Export primitive ID. */
10522    if (ctx->stage == ngg_vertex_gs && ctx->args->options->key.vs_common_out.export_prim_id) {
10523       /* Copy Primitive IDs from GS threads to the LDS address corresponding to the ES thread of the provoking vertex. */
10524       Temp prim_id = get_arg(ctx, ctx->args->ac.gs_prim_id);
10525       Temp provoking_vtx_index = vtxindex[0];
10526       Temp addr = bld.v_mul_imm(bld.def(v1), provoking_vtx_index, 4u);
10527
10528       store_lds(ctx, 4, prim_id, 0x1u, addr, 0u, 4u);
10529    }
10530
10531    begin_divergent_if_else(ctx, &ic);
10532    end_divergent_if(ctx, &ic);
10533 }
10534
10535 void ngg_emit_nogs_output(isel_context *ctx)
10536 {
10537    /* Emits NGG GS output, for stages that don't have SW GS. */
10538
10539    if_context ic;
10540    Builder bld(ctx->program, ctx->block);
10541    bool late_prim_export = !ngg_early_prim_export(ctx);
10542
10543    /* NGG streamout is currently disabled by default. */
10544    assert(!ctx->args->shader_info->so.num_outputs);
10545
10546    if (late_prim_export) {
10547       /* VS exports are output to registers in a predecessor block. Emit phis to get them into this block. */
10548       create_export_phis(ctx);
10549       /* Do what we need to do in the GS threads. */
10550       ngg_emit_nogs_gsthreads(ctx);
10551
10552       /* What comes next should be executed on ES threads. */
10553       Temp is_es_thread = merged_wave_info_to_mask(ctx, 0);
10554       begin_divergent_if_then(ctx, &ic, is_es_thread);
10555       bld.reset(ctx->block);
10556    }
10557
10558    /* Export VS outputs */
10559    ctx->block->kind |= block_kind_export_end;
10560    create_vs_exports(ctx);
10561
10562    /* Export primitive ID */
10563    if (ctx->args->options->key.vs_common_out.export_prim_id) {
10564       Temp prim_id;
10565
10566       if (ctx->stage == ngg_vertex_gs) {
10567          /* Wait for GS threads to store primitive ID in LDS. */
10568          bld.barrier(aco_opcode::p_memory_barrier_shared);
10569          bld.sopp(aco_opcode::s_barrier);
10570
10571          /* Calculate LDS address where the GS threads stored the primitive ID. */
10572          Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10573                                        get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16)));
10574          Temp thread_id_in_wave = emit_mbcnt(ctx, bld.def(v1));
10575          Temp wave_id_mul = bld.v_mul_imm(bld.def(v1), as_vgpr(ctx, wave_id_in_tg), ctx->program->wave_size);
10576          Temp thread_id_in_tg = bld.vadd32(bld.def(v1), Operand(wave_id_mul), Operand(thread_id_in_wave));
10577          Temp addr = bld.v_mul_imm(bld.def(v1), thread_id_in_tg, 4u);
10578
10579          /* Load primitive ID from LDS. */
10580          prim_id = load_lds(ctx, 4, bld.tmp(v1), addr, 0u, 4u);
10581       } else if (ctx->stage == ngg_tess_eval_gs) {
10582          /* TES: Just use the patch ID as the primitive ID. */
10583          prim_id = get_arg(ctx, ctx->args->ac.tes_patch_id);
10584       } else {
10585          unreachable("unsupported NGG shader stage.");
10586       }
10587
10588       ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
10589       ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = prim_id;
10590
10591       export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, nullptr);
10592    }
10593
10594    if (late_prim_export) {
10595       begin_divergent_if_else(ctx, &ic);
10596       end_divergent_if(ctx, &ic);
10597       bld.reset(ctx->block);
10598    }
10599 }
10600
10601 void select_program(Program *program,
10602                     unsigned shader_count,
10603                     struct nir_shader *const *shaders,
10604                     ac_shader_config* config,
10605                     struct radv_shader_args *args)
10606 {
10607    isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
10608    if_context ic_merged_wave_info;
10609    bool ngg_no_gs = ctx.stage == ngg_vertex_gs || ctx.stage == ngg_tess_eval_gs;
10610
10611    for (unsigned i = 0; i < shader_count; i++) {
10612       nir_shader *nir = shaders[i];
10613       init_context(&ctx, nir);
10614
10615       setup_fp_mode(&ctx, nir);
10616
10617       if (!i) {
10618          /* needs to be after init_context() for FS */
10619          Pseudo_instruction *startpgm = add_startpgm(&ctx);
10620          append_logical_start(ctx.block);
10621
10622          if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
10623             fix_ls_vgpr_init_bug(&ctx, startpgm);
10624
10625          split_arguments(&ctx, startpgm);
10626       }
10627
10628       if (ngg_no_gs) {
10629          ngg_emit_sendmsg_gs_alloc_req(&ctx);
10630
10631          if (ngg_early_prim_export(&ctx))
10632             ngg_emit_nogs_gsthreads(&ctx);
10633       }
10634
10635       /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
10636       nir_function_impl *func = nir_shader_get_entrypoint(nir);
10637       bool empty_shader = nir_cf_list_is_empty_block(&func->body) &&
10638                           ((nir->info.stage == MESA_SHADER_VERTEX &&
10639                             (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
10640                            (nir->info.stage == MESA_SHADER_TESS_EVAL &&
10641                             ctx.stage == tess_eval_geometry_gs));
10642
10643       bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : ((shader_count >= 2 && !empty_shader) || ngg_no_gs);
10644       bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info;
10645       if (check_merged_wave_info) {
10646          Temp cond = merged_wave_info_to_mask(&ctx, i);
10647          begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
10648       }
10649
10650       if (i) {
10651          Builder bld(ctx.program, ctx.block);
10652
10653          bld.barrier(aco_opcode::p_memory_barrier_shared);
10654          bld.sopp(aco_opcode::s_barrier);
10655
10656          if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
10657             ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
10658          }
10659       } else if (ctx.stage == geometry_gs)
10660          ctx.gs_wave_id = get_arg(&ctx, args->gs_wave_id);
10661
10662       if (ctx.stage == fragment_fs)
10663          handle_bc_optimize(&ctx);
10664
10665       visit_cf_list(&ctx, &func->body);
10666
10667       if (ctx.program->info->so.num_outputs && (ctx.stage & hw_vs))
10668          emit_streamout(&ctx, 0);
10669
10670       if (ctx.stage & hw_vs) {
10671          create_vs_exports(&ctx);
10672          ctx.block->kind |= block_kind_export_end;
10673       } else if (ngg_no_gs && ngg_early_prim_export(&ctx)) {
10674          ngg_emit_nogs_output(&ctx);
10675       } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
10676          Builder bld(ctx.program, ctx.block);
10677          bld.barrier(aco_opcode::p_memory_barrier_gs_data);
10678          bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
10679       } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
10680          write_tcs_tess_factors(&ctx);
10681       }
10682
10683       if (ctx.stage == fragment_fs) {
10684          create_fs_exports(&ctx);
10685          ctx.block->kind |= block_kind_export_end;
10686       }
10687
10688       if (endif_merged_wave_info) {
10689          begin_divergent_if_else(&ctx, &ic_merged_wave_info);
10690          end_divergent_if(&ctx, &ic_merged_wave_info);
10691       }
10692
10693       if (ngg_no_gs && !ngg_early_prim_export(&ctx))
10694          ngg_emit_nogs_output(&ctx);
10695
10696       ralloc_free(ctx.divergent_vals);
10697
10698       if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
10699          /* Outputs of the previous stage are inputs to the next stage */
10700          ctx.inputs = ctx.outputs;
10701          ctx.outputs = shader_io_state();
10702       }
10703    }
10704
10705    program->config->float_mode = program->blocks[0].fp_mode.val;
10706
10707    append_logical_end(ctx.block);
10708    ctx.block->kind |= block_kind_uniform;
10709    Builder bld(ctx.program, ctx.block);
10710    if (ctx.program->wb_smem_l1_on_end)
10711       bld.smem(aco_opcode::s_dcache_wb, false);
10712    bld.sopp(aco_opcode::s_endpgm);
10713
10714    cleanup_cfg(program);
10715 }
10716
10717 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
10718                            ac_shader_config* config,
10719                            struct radv_shader_args *args)
10720 {
10721    isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
10722
10723    program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
10724    program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
10725    program->next_fp_mode.must_flush_denorms32 = false;
10726    program->next_fp_mode.must_flush_denorms16_64 = false;
10727    program->next_fp_mode.care_about_round32 = false;
10728    program->next_fp_mode.care_about_round16_64 = false;
10729    program->next_fp_mode.denorm16_64 = fp_denorm_keep;
10730    program->next_fp_mode.denorm32 = 0;
10731    program->next_fp_mode.round32 = fp_round_ne;
10732    program->next_fp_mode.round16_64 = fp_round_ne;
10733    ctx.block->fp_mode = program->next_fp_mode;
10734
10735    add_startpgm(&ctx);
10736    append_logical_start(ctx.block);
10737
10738    Builder bld(ctx.program, ctx.block);
10739
10740    Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), program->private_segment_buffer, Operand(RING_GSVS_VS * 16u));
10741
10742    Operand stream_id(0u);
10743    if (args->shader_info->so.num_outputs)
10744       stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10745                            get_arg(&ctx, ctx.args->streamout_config), Operand(0x20018u));
10746
10747    Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), get_arg(&ctx, ctx.args->ac.vertex_id));
10748
10749    std::stack<Block> endif_blocks;
10750
10751    for (unsigned stream = 0; stream < 4; stream++) {
10752       if (stream_id.isConstant() && stream != stream_id.constantValue())
10753          continue;
10754
10755       unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
10756       if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
10757          continue;
10758
10759       memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));
10760
10761       unsigned BB_if_idx = ctx.block->index;
10762       Block BB_endif = Block();
10763       if (!stream_id.isConstant()) {
10764          /* begin IF */
10765          Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream));
10766          append_logical_end(ctx.block);
10767          ctx.block->kind |= block_kind_uniform;
10768          bld.branch(aco_opcode::p_cbranch_z, cond);
10769
10770          BB_endif.kind |= ctx.block->kind & block_kind_top_level;
10771
10772          ctx.block = ctx.program->create_and_insert_block();
10773          add_edge(BB_if_idx, ctx.block);
10774          bld.reset(ctx.block);
10775          append_logical_start(ctx.block);
10776       }
10777
10778       unsigned offset = 0;
10779       for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
10780          if (args->shader_info->gs.output_streams[i] != stream)
10781             continue;
10782
10783          unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
10784          unsigned length = util_last_bit(output_usage_mask);
10785          for (unsigned j = 0; j < length; ++j) {
10786             if (!(output_usage_mask & (1 << j)))
10787                continue;
10788
10789             unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
10790             Temp voffset = vtx_offset;
10791             if (const_offset >= 4096u) {
10792                voffset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), voffset);
10793                const_offset %= 4096u;
10794             }
10795
10796             aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)};
10797             mubuf->definitions[0] = bld.def(v1);
10798             mubuf->operands[0] = Operand(gsvs_ring);
10799             mubuf->operands[1] = Operand(voffset);
10800             mubuf->operands[2] = Operand(0u);
10801             mubuf->offen = true;
10802             mubuf->offset = const_offset;
10803             mubuf->glc = true;
10804             mubuf->slc = true;
10805             mubuf->dlc = args->options->chip_class >= GFX10;
10806             mubuf->barrier = barrier_none;
10807             mubuf->can_reorder = true;
10808
10809             ctx.outputs.mask[i] |= 1 << j;
10810             ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();
10811
10812             bld.insert(std::move(mubuf));
10813
10814             offset++;
10815          }
10816       }
10817
10818       if (args->shader_info->so.num_outputs) {
10819          emit_streamout(&ctx, stream);
10820          bld.reset(ctx.block);
10821       }
10822
10823       if (stream == 0) {
10824          create_vs_exports(&ctx);
10825          ctx.block->kind |= block_kind_export_end;
10826       }
10827
10828       if (!stream_id.isConstant()) {
10829          append_logical_end(ctx.block);
10830
10831          /* branch from then block to endif block */
10832          bld.branch(aco_opcode::p_branch);
10833          add_edge(ctx.block->index, &BB_endif);
10834          ctx.block->kind |= block_kind_uniform;
10835
10836          /* emit else block */
10837          ctx.block = ctx.program->create_and_insert_block();
10838          add_edge(BB_if_idx, ctx.block);
10839          bld.reset(ctx.block);
10840          append_logical_start(ctx.block);
10841
10842          endif_blocks.push(std::move(BB_endif));
10843       }
10844    }
10845
10846    while (!endif_blocks.empty()) {
10847       Block BB_endif = std::move(endif_blocks.top());
10848       endif_blocks.pop();
10849
10850       Block *BB_else = ctx.block;
10851
10852       append_logical_end(BB_else);
10853       /* branch from else block to endif block */
10854       bld.branch(aco_opcode::p_branch);
10855       add_edge(BB_else->index, &BB_endif);
10856       BB_else->kind |= block_kind_uniform;
10857
10858       /** emit endif merge block */
10859       ctx.block = program->insert_block(std::move(BB_endif));
10860       bld.reset(ctx.block);
10861       append_logical_start(ctx.block);
10862    }
10863
10864    program->config->float_mode = program->blocks[0].fp_mode.val;
10865
10866    append_logical_end(ctx.block);
10867    ctx.block->kind |= block_kind_uniform;
10868    bld.sopp(aco_opcode::s_endpgm);
10869
10870    cleanup_cfg(program);
10871 }
10872 }