src/amd/compiler/aco_instruction_selection.cpp

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  * Copyright © 2018 Google
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  *
  24  */
  25
  26 #include <algorithm>
  27 #include <array>
  28 #include <stack>
  29 #include <map>
  30
  31 #include "ac_shader_util.h"
  32 #include "aco_ir.h"
  33 #include "aco_builder.h"
  34 #include "aco_interface.h"
  35 #include "aco_instruction_selection_setup.cpp"
  36 #include "util/fast_idiv_by_const.h"
  37
  38 namespace aco {
  39 namespace {
  40
  41 class loop_info_RAII {
  42    isel_context* ctx;
  43    unsigned header_idx_old;
  44    Block* exit_old;
  45    bool divergent_cont_old;
  46    bool divergent_branch_old;
  47    bool divergent_if_old;
  48
  49 public:
  50    loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
  51       : ctx(ctx),
  52         header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
  53         divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
  54         divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
  55         divergent_if_old(ctx->cf_info.parent_if.is_divergent)
  56    {
  57       ctx->cf_info.parent_loop.header_idx = loop_header_idx;
  58       ctx->cf_info.parent_loop.exit = loop_exit;
  59       ctx->cf_info.parent_loop.has_divergent_continue = false;
  60       ctx->cf_info.parent_loop.has_divergent_branch = false;
  61       ctx->cf_info.parent_if.is_divergent = false;
  62       ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
  63    }
  64
  65    ~loop_info_RAII()
  66    {
  67       ctx->cf_info.parent_loop.header_idx = header_idx_old;
  68       ctx->cf_info.parent_loop.exit = exit_old;
  69       ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
  70       ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
  71       ctx->cf_info.parent_if.is_divergent = divergent_if_old;
  72       ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
  73       if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
  74          ctx->cf_info.exec_potentially_empty_discard = false;
  75    }
  76 };
  77
/* Bookkeeping bundle for a single if construct.
 * NOTE(review): the code that fills and consumes this struct is not part of
 * this section of the file; the per-field notes below are inferred from the
 * field names and should be confirmed against the users. */
struct if_context {
   Temp cond; /* presumably the branch condition */

   /* fields with the _old suffix: presumably values saved on entry so they
    * can be restored afterwards - confirm */
   bool divergent_old;
   bool exec_potentially_empty_discard_old;
   bool exec_potentially_empty_break_old;
   uint16_t exec_potentially_empty_break_depth_old;

   unsigned BB_if_idx; /* presumably the index of the block holding the branch */
   unsigned invert_idx; /* presumably the index of BB_invert */
   bool uniform_has_then_branch;
   bool then_branch_divergent;
   Block BB_invert; /* stored by value */
   Block BB_endif; /* stored by value */
};
  93
/* Forward declaration only: being static, the function has to be defined
 * elsewhere in this translation unit. */
static bool visit_cf_list(struct isel_context *ctx,
                          struct exec_list *list);
  96
  97 static void add_logical_edge(unsigned pred_idx, Block *succ)
  98 {
  99    succ->logical_preds.emplace_back(pred_idx);
 100 }
 101
 102
 103 static void add_linear_edge(unsigned pred_idx, Block *succ)
 104 {
 105    succ->linear_preds.emplace_back(pred_idx);
 106 }
 107
 108 static void add_edge(unsigned pred_idx, Block *succ)
 109 {
 110    add_logical_edge(pred_idx, succ);
 111    add_linear_edge(pred_idx, succ);
 112 }
 113
 114 static void append_logical_start(Block *b)
 115 {
 116    Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
 117 }
 118
 119 static void append_logical_end(Block *b)
 120 {
 121    Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
 122 }
 123
 124 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
 125 {
 126    assert(ctx->allocated[def->index].id());
 127    return ctx->allocated[def->index];
 128 }
 129
 130 Temp emit_mbcnt(isel_context *ctx, Definition dst,
 131                 Operand mask_lo = Operand((uint32_t) -1), Operand mask_hi = Operand((uint32_t) -1))
 132 {
 133    Builder bld(ctx->program, ctx->block);
 134    Definition lo_def = ctx->program->wave_size == 32 ? dst : bld.def(v1);
 135    Temp thread_id_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, lo_def, mask_lo, Operand(0u));
 136
 137    if (ctx->program->wave_size == 32) {
 138       return thread_id_lo;
 139    } else if (ctx->program->chip_class <= GFX7) {
 140       Temp thread_id_hi = bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
 141       return thread_id_hi;
 142    } else {
 143       Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, dst, mask_hi, thread_id_lo);
 144       return thread_id_hi;
 145    }
 146 }
 147
 148 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
 149 {
 150    Builder bld(ctx->program, ctx->block);
 151
 152    if (!dst.id())
 153       dst = bld.tmp(src.regClass());
 154
 155    assert(src.size() == dst.size());
 156
 157    if (ctx->stage != fragment_fs) {
 158       if (!dst.id())
 159          return src;
 160
 161       bld.copy(Definition(dst), src);
 162       return dst;
 163    }
 164
 165    bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
 166    ctx->program->needs_wqm |= program_needs_wqm;
 167    return dst;
 168 }
 169
 170 static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data)
 171 {
 172    if (index.regClass() == s1)
 173       return bld.readlane(bld.def(s1), data, index);
 174
 175    if (ctx->options->chip_class <= GFX7) {
 176       /* GFX6-7: there is no bpermute instruction */
 177       Operand index_op(index);
 178       Operand input_data(data);
 179       index_op.setLateKill(true);
 180       input_data.setLateKill(true);
 181
 182       return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
 183    } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
 184       /* GFX10 wave64 mode: emulate full-wave bpermute */
 185       if (!ctx->has_gfx10_wave64_bpermute) {
 186          ctx->has_gfx10_wave64_bpermute = true;
 187          ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
 188          ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
 189       }
 190
 191       Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
 192       Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
 193       Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
 194       Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
 195       Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
 196       Operand input_data(data);
 197
 198       index_x4.setLateKill(true);
 199       input_data.setLateKill(true);
 200       same_half.setLateKill(true);
 201
 202       return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
 203    } else {
 204       /* GFX8-9 or GFX10 wave32: bpermute works normally */
 205       Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
 206       return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
 207    }
 208 }
 209
 210 static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask)
 211 {
 212    if (ctx->options->chip_class >= GFX8) {
 213       unsigned and_mask = mask & 0x1f;
 214       unsigned or_mask = (mask >> 5) & 0x1f;
 215       unsigned xor_mask = (mask >> 10) & 0x1f;
 216
 217       uint16_t dpp_ctrl = 0xffff;
 218
 219       // TODO: we could use DPP8 for some swizzles
 220       if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
 221          unsigned res[4] = {0, 1, 2, 3};
 222          for (unsigned i = 0; i < 4; i++)
 223             res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
 224          dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
 225       } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
 226          dpp_ctrl = dpp_row_rr(8);
 227       } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
 228          dpp_ctrl = dpp_row_mirror;
 229       } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
 230          dpp_ctrl = dpp_row_half_mirror;
 231       }
 232
 233       if (dpp_ctrl != 0xffff)
 234          return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
 235    }
 236
 237    return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
 238 }
 239
 240 Temp as_vgpr(isel_context *ctx, Temp val)
 241 {
 242    if (val.type() == RegType::sgpr) {
 243       Builder bld(ctx->program, ctx->block);
 244       return bld.copy(bld.def(RegType::vgpr, val.size()), val);
 245    }
 246    assert(val.type() == RegType::vgpr);
 247    return val;
 248 }
 249
 250 //assumes a != 0xffffffff
 251 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
 252 {
 253    assert(b != 0);
 254    Builder bld(ctx->program, ctx->block);
 255
 256    if (util_is_power_of_two_or_zero(b)) {
 257       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
 258       return;
 259    }
 260
 261    util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
 262
 263    assert(info.multiplier <= 0xffffffff);
 264
 265    bool pre_shift = info.pre_shift != 0;
 266    bool increment = info.increment != 0;
 267    bool multiply = true;
 268    bool post_shift = info.post_shift != 0;
 269
 270    if (!pre_shift && !increment && !multiply && !post_shift) {
 271       bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
 272       return;
 273    }
 274
 275    Temp pre_shift_dst = a;
 276    if (pre_shift) {
 277       pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
 278       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
 279    }
 280
 281    Temp increment_dst = pre_shift_dst;
 282    if (increment) {
 283       increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
 284       bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
 285    }
 286
 287    Temp multiply_dst = increment_dst;
 288    if (multiply) {
 289       multiply_dst = post_shift ? bld.tmp(v1) : dst;
 290       bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
 291                bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
 292    }
 293
 294    if (post_shift) {
 295       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
 296    }
 297 }
 298
 299 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
 300 {
 301    Builder bld(ctx->program, ctx->block);
 302    bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
 303 }
 304
 305
 306 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
 307 {
 308    /* no need to extract the whole vector */
 309    if (src.regClass() == dst_rc) {
 310       assert(idx == 0);
 311       return src;
 312    }
 313
 314    assert(src.bytes() > (idx * dst_rc.bytes()));
 315    Builder bld(ctx->program, ctx->block);
 316    auto it = ctx->allocated_vec.find(src.id());
 317    if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) {
 318       if (it->second[idx].regClass() == dst_rc) {
 319          return it->second[idx];
 320       } else {
 321          assert(!dst_rc.is_subdword());
 322          assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
 323          return bld.copy(bld.def(dst_rc), it->second[idx]);
 324       }
 325    }
 326
 327    if (dst_rc.is_subdword())
 328       src = as_vgpr(ctx, src);
 329
 330    if (src.bytes() == dst_rc.bytes()) {
 331       assert(idx == 0);
 332       return bld.copy(bld.def(dst_rc), src);
 333    } else {
 334       Temp dst = bld.tmp(dst_rc);
 335       emit_extract_vector(ctx, src, idx, dst);
 336       return dst;
 337    }
 338 }
 339
 340 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
 341 {
 342    if (num_components == 1)
 343       return;
 344    if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
 345       return;
 346    RegClass rc;
 347    if (num_components > vec_src.size()) {
 348       if (vec_src.type() == RegType::sgpr) {
 349          /* should still help get_alu_src() */
 350          emit_split_vector(ctx, vec_src, vec_src.size());
 351          return;
 352       }
 353       /* sub-dword split */
 354       rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
 355    } else {
 356       rc = RegClass(vec_src.type(), vec_src.size() / num_components);
 357    }
 358    aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
 359    split->operands[0] = Operand(vec_src);
 360    std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
 361    for (unsigned i = 0; i < num_components; i++) {
 362       elems[i] = {ctx->program->allocateId(), rc};
 363       split->definitions[i] = Definition(elems[i]);
 364    }
 365    ctx->block->instructions.emplace_back(std::move(split));
 366    ctx->allocated_vec.emplace(vec_src.id(), elems);
 367 }
 368
 369 /* This vector expansion uses a mask to determine which elements in the new vector
 370  * come from the original vector. The other elements are undefined. */
 371 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
 372 {
 373    emit_split_vector(ctx, vec_src, util_bitcount(mask));
 374
 375    if (vec_src == dst)
 376       return;
 377
 378    Builder bld(ctx->program, ctx->block);
 379    if (num_components == 1) {
 380       if (dst.type() == RegType::sgpr)
 381          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
 382       else
 383          bld.copy(Definition(dst), vec_src);
 384       return;
 385    }
 386
 387    unsigned component_size = dst.size() / num_components;
 388    std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
 389
 390    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
 391    vec->definitions[0] = Definition(dst);
 392    unsigned k = 0;
 393    for (unsigned i = 0; i < num_components; i++) {
 394       if (mask & (1 << i)) {
 395          Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
 396          if (dst.type() == RegType::sgpr)
 397             src = bld.as_uniform(src);
 398          vec->operands[i] = Operand(src);
 399       } else {
 400          vec->operands[i] = Operand(0u);
 401       }
 402       elems[i] = vec->operands[i].getTemp();
 403    }
 404    ctx->block->instructions.emplace_back(std::move(vec));
 405    ctx->allocated_vec.emplace(dst.id(), elems);
 406 }
 407
 408 /* adjust misaligned small bit size loads */
 409 void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
 410 {
 411    Builder bld(ctx->program, ctx->block);
 412    Operand shift;
 413    Temp select = Temp();
 414    if (offset.isConstant()) {
 415       assert(offset.constantValue() && offset.constantValue() < 4);
 416       shift = Operand(offset.constantValue() * 8);
 417    } else {
 418       /* bit_offset = 8 * (offset & 0x3) */
 419       Temp tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand(3u));
 420       select = bld.tmp(s1);
 421       shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, Operand(3u));
 422    }
 423
 424    if (vec.size() == 1) {
 425       bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift);
 426    } else if (vec.size() == 2) {
 427       Temp tmp = dst.size() == 2 ? dst : bld.tmp(s2);
 428       bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift);
 429       if (tmp == dst)
 430          emit_split_vector(ctx, dst, 2);
 431       else
 432          emit_extract_vector(ctx, tmp, 0, dst);
 433    } else if (vec.size() == 4) {
 434       Temp lo = bld.tmp(s2), hi = bld.tmp(s2);
 435       bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
 436       hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
 437       if (select != Temp())
 438          hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select));
 439       lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
 440       Temp mid = bld.tmp(s1);
 441       lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
 442       hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift);
 443       mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid);
 444       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid);
 445       emit_split_vector(ctx, dst, 2);
 446    }
 447 }
 448
 449 void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
 450 {
 451    Builder bld(ctx->program, ctx->block);
 452    if (offset.isTemp()) {
 453       Temp tmp[4] = {vec, vec, vec, vec};
 454
 455       if (vec.size() == 4) {
 456          tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
 457          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec);
 458       } else if (vec.size() == 3) {
 459          tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
 460          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
 461       } else if (vec.size() == 2) {
 462          tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
 463          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
 464       }
 465       for (unsigned i = 0; i < dst.size(); i++)
 466          tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
 467
 468       vec = tmp[0];
 469       if (dst.size() == 2)
 470          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
 471
 472       offset = Operand(0u);
 473    }
 474
 475    unsigned num_components = dst.bytes() / component_size;
 476    if (vec.regClass() == dst.regClass()) {
 477       assert(offset.constantValue() == 0);
 478       bld.copy(Definition(dst), vec);
 479       emit_split_vector(ctx, dst, num_components);
 480       return;
 481    }
 482
 483    emit_split_vector(ctx, vec, vec.bytes() / component_size);
 484    std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
 485    RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
 486
 487    assert(offset.constantValue() % component_size == 0);
 488    unsigned skip = offset.constantValue() / component_size;
 489    for (unsigned i = 0; i < num_components; i++)
 490       elems[i] = emit_extract_vector(ctx, vec, i + skip, rc);
 491
 492    /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
 493    if (dst.type() == RegType::vgpr) {
 494       aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
 495       for (unsigned i = 0; i < num_components; i++)
 496          create_vec->operands[i] = Operand(elems[i]);
 497       create_vec->definitions[0] = Definition(dst);
 498       bld.insert(std::move(create_vec));
 499
 500    /* if dst is sgpr - split the src, but move the original to sgpr. */
 501    } else if (skip) {
 502       vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
 503       byte_align_scalar(ctx, vec, offset, dst);
 504    } else {
 505       assert(dst.size() == vec.size());
 506       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
 507    }
 508
 509    ctx->allocated_vec.emplace(dst.id(), elems);
 510 }
 511
 512 Temp bool_to_vector_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s2))
 513 {
 514    Builder bld(ctx->program, ctx->block);
 515    if (!dst.id())
 516       dst = bld.tmp(bld.lm);
 517
 518    assert(val.regClass() == s1);
 519    assert(dst.regClass() == bld.lm);
 520
 521    return bld.sop2(Builder::s_cselect, Definition(dst), Operand((uint32_t) -1), Operand(0u), bld.scc(val));
 522 }
 523
 524 Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1))
 525 {
 526    Builder bld(ctx->program, ctx->block);
 527    if (!dst.id())
 528       dst = bld.tmp(s1);
 529
 530    assert(val.regClass() == bld.lm);
 531    assert(dst.regClass() == s1);
 532
 533    /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
 534    Temp tmp = bld.tmp(s1);
 535    bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm));
 536    return emit_wqm(ctx, tmp, dst);
 537 }
 538
 539 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
 540 {
 541    if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
 542       return get_ssa_temp(ctx, src.src.ssa);
 543
 544    if (src.src.ssa->num_components == size) {
 545       bool identity_swizzle = true;
 546       for (unsigned i = 0; identity_swizzle && i < size; i++) {
 547          if (src.swizzle[i] != i)
 548             identity_swizzle = false;
 549       }
 550       if (identity_swizzle)
 551          return get_ssa_temp(ctx, src.src.ssa);
 552    }
 553
 554    Temp vec = get_ssa_temp(ctx, src.src.ssa);
 555    unsigned elem_size = vec.bytes() / src.src.ssa->num_components;
 556    assert(elem_size > 0);
 557    assert(vec.bytes() % elem_size == 0);
 558
 559    if (elem_size < 4 && vec.type() == RegType::sgpr) {
 560       assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
 561       assert(size == 1);
 562       unsigned swizzle = src.swizzle[0];
 563       if (vec.size() > 1) {
 564          assert(src.src.ssa->bit_size == 16);
 565          vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
 566          swizzle = swizzle & 1;
 567       }
 568       if (swizzle == 0)
 569          return vec;
 570
 571       Temp dst{ctx->program->allocateId(), s1};
 572       aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 2)};
 573       bfe->operands[0] = Operand(vec);
 574       bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle)));
 575       bfe->definitions[0] = Definition(dst);
 576       bfe->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
 577       ctx->block->instructions.emplace_back(std::move(bfe));
 578       return dst;
 579    }
 580
 581    RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4);
 582    if (size == 1) {
 583       return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
 584    } else {
 585       assert(size <= 4);
 586       std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
 587       aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
 588       for (unsigned i = 0; i < size; ++i) {
 589          elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
 590          vec_instr->operands[i] = Operand{elems[i]};
 591       }
 592       Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size / 4)};
 593       vec_instr->definitions[0] = Definition(dst);
 594       ctx->block->instructions.emplace_back(std::move(vec_instr));
 595       ctx->allocated_vec.emplace(dst.id(), elems);
 596       return dst;
 597    }
 598 }
 599
 600 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
 601 {
 602    if (ptr.size() == 2)
 603       return ptr;
 604    Builder bld(ctx->program, ctx->block);
 605    if (ptr.type() == RegType::vgpr)
 606       ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
 607    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
 608                      ptr, Operand((unsigned)ctx->options->address32_hi));
 609 }
 610
 611 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
 612 {
 613    aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
 614    sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
 615    sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
 616    sop2->definitions[0] = Definition(dst);
 617    if (writes_scc)
 618       sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
 619    ctx->block->instructions.emplace_back(std::move(sop2));
 620 }
 621
 622 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
 623                            bool commutative, bool swap_srcs=false, bool flush_denorms = false)
 624 {
 625    Builder bld(ctx->program, ctx->block);
 626    bld.is_precise = instr->exact;
 627
 628    Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
 629    Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
 630    if (src1.type() == RegType::sgpr) {
 631       if (commutative && src0.type() == RegType::vgpr) {
 632          Temp t = src0;
 633          src0 = src1;
 634          src1 = t;
 635       } else {
 636          src1 = as_vgpr(ctx, src1);
 637       }
 638    }
 639
 640    if (flush_denorms && ctx->program->chip_class < GFX9) {
 641       assert(dst.size() == 1);
 642       Temp tmp = bld.vop2(op, bld.def(v1), src0, src1);
 643       bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
 644    } else {
 645       bld.vop2(op, Definition(dst), src0, src1);
 646    }
 647 }
 648
 649 void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr,
 650                                    aco_opcode op, Temp dst)
 651 {
 652    Builder bld(ctx->program, ctx->block);
 653    bld.is_precise = instr->exact;
 654
 655    Temp src0 = get_alu_src(ctx, instr->src[0]);
 656    Temp src1 = get_alu_src(ctx, instr->src[1]);
 657
 658    if (src1.type() == RegType::sgpr) {
 659       assert(src0.type() == RegType::vgpr);
 660       std::swap(src0, src1);
 661    }
 662
 663    Temp src00 = bld.tmp(src0.type(), 1);
 664    Temp src01 = bld.tmp(src0.type(), 1);
 665    bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
 666    Temp src10 = bld.tmp(v1);
 667    Temp src11 = bld.tmp(v1);
 668    bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
 669    Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
 670    Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
 671    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
 672 }
 673
 674 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
 675                             bool flush_denorms = false)
 676 {
 677    Temp src0 = get_alu_src(ctx, instr->src[0]);
 678    Temp src1 = get_alu_src(ctx, instr->src[1]);
 679    Temp src2 = get_alu_src(ctx, instr->src[2]);
 680
 681    /* ensure that the instruction has at most 1 sgpr operand
 682     * The optimizer will inline constants for us */
 683    if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
 684       src0 = as_vgpr(ctx, src0);
 685    if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
 686       src1 = as_vgpr(ctx, src1);
 687    if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
 688       src2 = as_vgpr(ctx, src2);
 689
 690    Builder bld(ctx->program, ctx->block);
 691    bld.is_precise = instr->exact;
 692    if (flush_denorms && ctx->program->chip_class < GFX9) {
 693       assert(dst.size() == 1);
 694       Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2);
 695       bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand(0x3f800000u), tmp);
 696    } else {
 697       bld.vop3(op, Definition(dst), src0, src1, src2);
 698    }
 699 }
 700
 701 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 702 {
 703    Builder bld(ctx->program, ctx->block);
 704    bld.is_precise = instr->exact;
 705    if (dst.type() == RegType::sgpr)
 706       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
 707                  bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
 708    else
 709       bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
 710 }
 711
 712 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 713 {
 714    Temp src0 = get_alu_src(ctx, instr->src[0]);
 715    Temp src1 = get_alu_src(ctx, instr->src[1]);
 716    assert(src0.size() == src1.size());
 717
 718    aco_ptr<Instruction> vopc;
 719    if (src1.type() == RegType::sgpr) {
 720       if (src0.type() == RegType::vgpr) {
 721          /* to swap the operands, we might also have to change the opcode */
 722          switch (op) {
 723             case aco_opcode::v_cmp_lt_f16:
 724                op = aco_opcode::v_cmp_gt_f16;
 725                break;
 726             case aco_opcode::v_cmp_ge_f16:
 727                op = aco_opcode::v_cmp_le_f16;
 728                break;
 729             case aco_opcode::v_cmp_lt_i16:
 730                op = aco_opcode::v_cmp_gt_i16;
 731                break;
 732             case aco_opcode::v_cmp_ge_i16:
 733                op = aco_opcode::v_cmp_le_i16;
 734                break;
 735             case aco_opcode::v_cmp_lt_u16:
 736                op = aco_opcode::v_cmp_gt_u16;
 737                break;
 738             case aco_opcode::v_cmp_ge_u16:
 739                op = aco_opcode::v_cmp_le_u16;
 740                break;
 741             case aco_opcode::v_cmp_lt_f32:
 742                op = aco_opcode::v_cmp_gt_f32;
 743                break;
 744             case aco_opcode::v_cmp_ge_f32:
 745                op = aco_opcode::v_cmp_le_f32;
 746                break;
 747             case aco_opcode::v_cmp_lt_i32:
 748                op = aco_opcode::v_cmp_gt_i32;
 749                break;
 750             case aco_opcode::v_cmp_ge_i32:
 751                op = aco_opcode::v_cmp_le_i32;
 752                break;
 753             case aco_opcode::v_cmp_lt_u32:
 754                op = aco_opcode::v_cmp_gt_u32;
 755                break;
 756             case aco_opcode::v_cmp_ge_u32:
 757                op = aco_opcode::v_cmp_le_u32;
 758                break;
 759             case aco_opcode::v_cmp_lt_f64:
 760                op = aco_opcode::v_cmp_gt_f64;
 761                break;
 762             case aco_opcode::v_cmp_ge_f64:
 763                op = aco_opcode::v_cmp_le_f64;
 764                break;
 765             case aco_opcode::v_cmp_lt_i64:
 766                op = aco_opcode::v_cmp_gt_i64;
 767                break;
 768             case aco_opcode::v_cmp_ge_i64:
 769                op = aco_opcode::v_cmp_le_i64;
 770                break;
 771             case aco_opcode::v_cmp_lt_u64:
 772                op = aco_opcode::v_cmp_gt_u64;
 773                break;
 774             case aco_opcode::v_cmp_ge_u64:
 775                op = aco_opcode::v_cmp_le_u64;
 776                break;
 777             default: /* eq and ne are commutative */
 778                break;
 779          }
 780          Temp t = src0;
 781          src0 = src1;
 782          src1 = t;
 783       } else {
 784          src1 = as_vgpr(ctx, src1);
 785       }
 786    }
 787
 788    Builder bld(ctx->program, ctx->block);
 789    bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1);
 790 }
 791
 792 void emit_sopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 793 {
 794    Temp src0 = get_alu_src(ctx, instr->src[0]);
 795    Temp src1 = get_alu_src(ctx, instr->src[1]);
 796    Builder bld(ctx->program, ctx->block);
 797
 798    assert(dst.regClass() == bld.lm);
 799    assert(src0.type() == RegType::sgpr);
 800    assert(src1.type() == RegType::sgpr);
 801    assert(src0.regClass() == src1.regClass());
 802
 803    /* Emit the SALU comparison instruction */
 804    Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1);
 805    /* Turn the result into a per-lane bool */
 806    bool_to_vector_condition(ctx, cmp, dst);
 807 }
 808
 809 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
 810                      aco_opcode v16_op, aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, aco_opcode s64_op = aco_opcode::num_opcodes)
 811 {
 812    aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
 813    aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
 814    bool use_valu = s_op == aco_opcode::num_opcodes ||
 815                    nir_dest_is_divergent(instr->dest.dest) ||
 816                    ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
 817                    ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
 818    aco_opcode op = use_valu ? v_op : s_op;
 819    assert(op != aco_opcode::num_opcodes);
 820    assert(dst.regClass() == ctx->program->lane_mask);
 821
 822    if (use_valu)
 823       emit_vopc_instruction(ctx, instr, op, dst);
 824    else
 825       emit_sopc_instruction(ctx, instr, op, dst);
 826 }
 827
 828 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, Builder::WaveSpecificOpcode op, Temp dst)
 829 {
 830    Builder bld(ctx->program, ctx->block);
 831    Temp src0 = get_alu_src(ctx, instr->src[0]);
 832    Temp src1 = get_alu_src(ctx, instr->src[1]);
 833
 834    assert(dst.regClass() == bld.lm);
 835    assert(src0.regClass() == bld.lm);
 836    assert(src1.regClass() == bld.lm);
 837
 838    bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1);
 839 }
 840
 841 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
 842 {
 843    Builder bld(ctx->program, ctx->block);
 844    Temp cond = get_alu_src(ctx, instr->src[0]);
 845    Temp then = get_alu_src(ctx, instr->src[1]);
 846    Temp els = get_alu_src(ctx, instr->src[2]);
 847
 848    assert(cond.regClass() == bld.lm);
 849
 850    if (dst.type() == RegType::vgpr) {
 851       aco_ptr<Instruction> bcsel;
 852       if (dst.size() == 1) {
 853          then = as_vgpr(ctx, then);
 854          els = as_vgpr(ctx, els);
 855
 856          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
 857       } else if (dst.size() == 2) {
 858          Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
 859          bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
 860          Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
 861          bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
 862
 863          Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
 864          Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
 865
 866          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
 867       } else {
 868          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 869          nir_print_instr(&instr->instr, stderr);
 870          fprintf(stderr, "\n");
 871       }
 872       return;
 873    }
 874
 875    if (instr->dest.dest.ssa.bit_size == 1) {
 876       assert(dst.regClass() == bld.lm);
 877       assert(then.regClass() == bld.lm);
 878       assert(els.regClass() == bld.lm);
 879    }
 880
 881    if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
 882       if (dst.regClass() == s1 || dst.regClass() == s2) {
 883          assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
 884          assert(dst.size() == then.size());
 885          aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
 886          bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
 887       } else {
 888          fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
 889          nir_print_instr(&instr->instr, stderr);
 890          fprintf(stderr, "\n");
 891       }
 892       return;
 893    }
 894
 895    /* divergent boolean bcsel
 896     * this implements bcsel on bools: dst = s0 ? s1 : s2
 897     * are going to be: dst = (s0 & s1) | (~s0 & s2) */
 898    assert(instr->dest.dest.ssa.bit_size == 1);
 899
 900    if (cond.id() != then.id())
 901       then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then);
 902
 903    if (cond.id() == els.id())
 904       bld.sop1(Builder::s_mov, Definition(dst), then);
 905    else
 906       bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then,
 907                bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond));
 908 }
 909
 910 void emit_scaled_op(isel_context *ctx, Builder& bld, Definition dst, Temp val,
 911                     aco_opcode op, uint32_t undo)
 912 {
 913    /* multiply by 16777216 to handle denormals */
 914    Temp is_denormal = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)),
 915                                as_vgpr(ctx, val), bld.copy(bld.def(v1), Operand((1u << 7) | (1u << 4))));
 916    Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x4b800000u), val);
 917    scaled = bld.vop1(op, bld.def(v1), scaled);
 918    scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(undo), scaled);
 919
 920    Temp not_scaled = bld.vop1(op, bld.def(v1), val);
 921
 922    bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal);
 923 }
 924
 925 void emit_rcp(isel_context *ctx, Builder& bld, Definition dst, Temp val)
 926 {
 927    if (ctx->block->fp_mode.denorm32 == 0) {
 928       bld.vop1(aco_opcode::v_rcp_f32, dst, val);
 929       return;
 930    }
 931
 932    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u);
 933 }
 934
 935 void emit_rsq(isel_context *ctx, Builder& bld, Definition dst, Temp val)
 936 {
 937    if (ctx->block->fp_mode.denorm32 == 0) {
 938       bld.vop1(aco_opcode::v_rsq_f32, dst, val);
 939       return;
 940    }
 941
 942    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u);
 943 }
 944
 945 void emit_sqrt(isel_context *ctx, Builder& bld, Definition dst, Temp val)
 946 {
 947    if (ctx->block->fp_mode.denorm32 == 0) {
 948       bld.vop1(aco_opcode::v_sqrt_f32, dst, val);
 949       return;
 950    }
 951
 952    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u);
 953 }
 954
 955 void emit_log2(isel_context *ctx, Builder& bld, Definition dst, Temp val)
 956 {
 957    if (ctx->block->fp_mode.denorm32 == 0) {
 958       bld.vop1(aco_opcode::v_log_f32, dst, val);
 959       return;
 960    }
 961
 962    emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u);
 963 }
 964
 965 Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
 966 {
 967    if (ctx->options->chip_class >= GFX7)
 968       return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val);
 969
 970    /* GFX6 doesn't support V_TRUNC_F64, lower it. */
 971    /* TODO: create more efficient code! */
 972    if (val.type() == RegType::sgpr)
 973       val = as_vgpr(ctx, val);
 974
 975    /* Split the input value. */
 976    Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1);
 977    bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
 978
 979    /* Extract the exponent and compute the unbiased value. */
 980    Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
 981    exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));
 982
 983    /* Extract the fractional part. */
 984    Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
 985    fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent);
 986
 987    Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1);
 988    bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), fract_mask);
 989
 990    Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1);
 991    Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo);
 992    fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp);
 993    tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi);
 994    fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
 995
 996    /* Get the sign bit. */
 997    Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi);
 998
 999    /* Decide the operation to apply depending on the unbiased exponent. */
1000    Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
1001    Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, bld.copy(bld.def(v1), Operand(0u)), exp_lt0);
1002    Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0);
1003    Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand(51u));
1004    dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51);
1005    dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51);
1006
1007    return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi);
1008 }
1009
1010 Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
1011 {
1012    if (ctx->options->chip_class >= GFX7)
1013       return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
1014
1015    /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
1016     * lowered at NIR level for precision reasons). */
1017    Temp src0 = as_vgpr(ctx, val);
1018
1019    Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
1020    Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(-1u), Operand(0x3fefffffu));
1021
1022    Temp isnan = bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask);
1023    Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0);
1024    Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val);
1025
1026    Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
1027    bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0);
1028    Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
1029    bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min);
1030
1031    Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan);
1032    Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan);
1033
1034    Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1);
1035
1036    Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v);
1037    static_cast<VOP3A_instruction*>(add)->neg[1] = true;
1038
1039    return add->definitions[0].getTemp();
1040 }
1041
1042 Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) {
1043    if (!dst.id()) {
1044       if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
1045          dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
1046       else
1047          dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
1048    }
1049
1050    if (dst.bytes() == src.bytes() && dst_bits < src_bits)
1051       return bld.copy(Definition(dst), src);
1052    else if (dst.bytes() < src.bytes())
1053       return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
1054
1055    Temp tmp = dst;
1056    if (dst_bits == 64)
1057       tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
1058
1059    if (tmp == src) {
1060    } else if (src.regClass() == s1) {
1061       if (is_signed)
1062          bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src);
1063       else
1064          bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src);
1065    } else if (ctx->options->chip_class >= GFX8) {
1066       assert(src_bits != 8 || src.regClass() == v1b);
1067       assert(src_bits != 16 || src.regClass() == v2b);
1068       aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
1069       sdwa->operands[0] = Operand(src);
1070       sdwa->definitions[0] = Definition(tmp);
1071       if (is_signed)
1072          sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
1073       else
1074          sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
1075       sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
1076       bld.insert(std::move(sdwa));
1077    } else {
1078       assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
1079       aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
1080       bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u));
1081    }
1082
1083    if (dst_bits == 64) {
1084       if (is_signed && dst.regClass() == s2) {
1085          Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u));
1086          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
1087       } else if (is_signed && dst.regClass() == v2) {
1088          Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp);
1089          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
1090       } else {
1091          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
1092       }
1093    }
1094
1095    return dst;
1096 }
1097
1098 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
1099 {
1100    if (!instr->dest.dest.is_ssa) {
1101       fprintf(stderr, "nir alu dst not in ssa: ");
1102       nir_print_instr(&instr->instr, stderr);
1103       fprintf(stderr, "\n");
1104       abort();
1105    }
1106    Builder bld(ctx->program, ctx->block);
1107    bld.is_precise = instr->exact;
1108    Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
1109    switch(instr->op) {
1110    case nir_op_vec2:
1111    case nir_op_vec3:
1112    case nir_op_vec4: {
1113       std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
1114       unsigned num = instr->dest.dest.ssa.num_components;
1115       for (unsigned i = 0; i < num; ++i)
1116          elems[i] = get_alu_src(ctx, instr->src[i]);
1117
1118       if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
1119          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
1120          RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
1121          for (unsigned i = 0; i < num; ++i) {
1122             if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
1123                vec->operands[i] = Operand(emit_extract_vector(ctx, elems[i], 0, elem_rc));
1124             else
1125                vec->operands[i] = Operand{elems[i]};
1126          }
1127          vec->definitions[0] = Definition(dst);
1128          ctx->block->instructions.emplace_back(std::move(vec));
1129          ctx->allocated_vec.emplace(dst.id(), elems);
1130       } else {
1131          // TODO: that is a bit suboptimal..
1132          Temp mask = bld.copy(bld.def(s1), Operand((1u << instr->dest.dest.ssa.bit_size) - 1));
1133          for (unsigned i = 0; i < num - 1; ++i)
1134             if (((i+1) * instr->dest.dest.ssa.bit_size) % 32)
1135                elems[i] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask);
1136          for (unsigned i = 0; i < num; ++i) {
1137             unsigned bit = i * instr->dest.dest.ssa.bit_size;
1138             if (bit % 32 == 0) {
1139                elems[bit / 32] = elems[i];
1140             } else {
1141                elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc),
1142                                    elems[i], Operand((i * instr->dest.dest.ssa.bit_size) % 32));
1143                elems[bit / 32] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[bit / 32], elems[i]);
1144             }
1145          }
1146          if (dst.size() == 1)
1147             bld.copy(Definition(dst), elems[0]);
1148          else
1149             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), elems[0], elems[1]);
1150       }
1151       break;
1152    }
1153    case nir_op_mov: {
1154       Temp src = get_alu_src(ctx, instr->src[0]);
1155       aco_ptr<Instruction> mov;
1156       if (dst.type() == RegType::sgpr) {
1157          if (src.type() == RegType::vgpr)
1158             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
1159          else if (src.regClass() == s1)
1160             bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
1161          else if (src.regClass() == s2)
1162             bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
1163          else
1164             unreachable("wrong src register class for nir_op_imov");
1165       } else {
1166          if (dst.regClass() == v1)
1167             bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
1168          else if (dst.regClass() == v1b ||
1169                   dst.regClass() == v2b ||
1170                   dst.regClass() == v2)
1171             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
1172          else
1173             unreachable("wrong src register class for nir_op_imov");
1174       }
1175       break;
1176    }
1177    case nir_op_inot: {
1178       Temp src = get_alu_src(ctx, instr->src[0]);
1179       if (instr->dest.dest.ssa.bit_size == 1) {
1180          assert(src.regClass() == bld.lm);
1181          assert(dst.regClass() == bld.lm);
1182          /* Don't use s_andn2 here, this allows the optimizer to make a better decision */
1183          Temp tmp = bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), src);
1184          bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
1185       } else if (dst.regClass() == v1) {
1186          emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
1187       } else if (dst.regClass() == v2) {
1188          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
1189          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
1190          lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
1191          hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
1192          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
1193       } else if (dst.type() == RegType::sgpr) {
1194          aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
1195          bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
1196       } else {
1197          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1198          nir_print_instr(&instr->instr, stderr);
1199          fprintf(stderr, "\n");
1200       }
1201       break;
1202    }
1203    case nir_op_ineg: {
1204       Temp src = get_alu_src(ctx, instr->src[0]);
1205       if (dst.regClass() == v1) {
1206          bld.vsub32(Definition(dst), Operand(0u), Operand(src));
1207       } else if (dst.regClass() == s1) {
1208          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
1209       } else if (dst.size() == 2) {
1210          Temp src0 = bld.tmp(dst.type(), 1);
1211          Temp src1 = bld.tmp(dst.type(), 1);
1212          bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
1213
1214          if (dst.regClass() == s2) {
1215             Temp carry = bld.tmp(s1);
1216             Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
1217             Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
1218             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1219          } else {
1220             Temp lower = bld.tmp(v1);
1221             Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
1222             Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
1223             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1224          }
1225       } else {
1226          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1227          nir_print_instr(&instr->instr, stderr);
1228          fprintf(stderr, "\n");
1229       }
1230       break;
1231    }
1232    case nir_op_iabs: {
1233       if (dst.regClass() == s1) {
1234          bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
1235       } else if (dst.regClass() == v1) {
1236          Temp src = get_alu_src(ctx, instr->src[0]);
1237          bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
1238       } else {
1239          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1240          nir_print_instr(&instr->instr, stderr);
1241          fprintf(stderr, "\n");
1242       }
1243       break;
1244    }
1245    case nir_op_isign: {
1246       Temp src = get_alu_src(ctx, instr->src[0]);
1247       if (dst.regClass() == s1) {
1248          Temp tmp = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand((uint32_t)-1));
1249          bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand(1u));
1250       } else if (dst.regClass() == s2) {
1251          Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
1252          Temp neqz;
1253          if (ctx->program->chip_class >= GFX8)
1254             neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
1255          else
1256             neqz = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand(0u)).def(1).getTemp();
1257          /* SCC gets zero-extended to 64 bit */
1258          bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
1259       } else if (dst.regClass() == v1) {
1260          bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand((uint32_t)-1), src, Operand(1u));
1261       } else if (dst.regClass() == v2) {
1262          Temp upper = emit_extract_vector(ctx, src, 1, v1);
1263          Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
1264          Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
1265          Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
1266          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
1267          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1268       } else {
1269          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1270          nir_print_instr(&instr->instr, stderr);
1271          fprintf(stderr, "\n");
1272       }
1273       break;
1274    }
1275    case nir_op_imax: {
1276       if (dst.regClass() == v1) {
1277          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
1278       } else if (dst.regClass() == s1) {
1279          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
1280       } else {
1281          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1282          nir_print_instr(&instr->instr, stderr);
1283          fprintf(stderr, "\n");
1284       }
1285       break;
1286    }
1287    case nir_op_umax: {
1288       if (dst.regClass() == v1) {
1289          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
1290       } else if (dst.regClass() == s1) {
1291          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
1292       } else {
1293          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1294          nir_print_instr(&instr->instr, stderr);
1295          fprintf(stderr, "\n");
1296       }
1297       break;
1298    }
1299    case nir_op_imin: {
1300       if (dst.regClass() == v1) {
1301          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
1302       } else if (dst.regClass() == s1) {
1303          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
1304       } else {
1305          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1306          nir_print_instr(&instr->instr, stderr);
1307          fprintf(stderr, "\n");
1308       }
1309       break;
1310    }
1311    case nir_op_umin: {
1312       if (dst.regClass() == v1) {
1313          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
1314       } else if (dst.regClass() == s1) {
1315          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
1316       } else {
1317          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1318          nir_print_instr(&instr->instr, stderr);
1319          fprintf(stderr, "\n");
1320       }
1321       break;
1322    }
1323    case nir_op_ior: {
1324       if (instr->dest.dest.ssa.bit_size == 1) {
1325          emit_boolean_logic(ctx, instr, Builder::s_or, dst);
1326       } else if (dst.regClass() == v1) {
1327          emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
1328       } else if (dst.regClass() == v2) {
1329          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
1330       } else if (dst.regClass() == s1) {
1331          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
1332       } else if (dst.regClass() == s2) {
1333          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
1334       } else {
1335          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1336          nir_print_instr(&instr->instr, stderr);
1337          fprintf(stderr, "\n");
1338       }
1339       break;
1340    }
1341    case nir_op_iand: {
1342       if (instr->dest.dest.ssa.bit_size == 1) {
1343          emit_boolean_logic(ctx, instr, Builder::s_and, dst);
1344       } else if (dst.regClass() == v1) {
1345          emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
1346       } else if (dst.regClass() == v2) {
1347          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
1348       } else if (dst.regClass() == s1) {
1349          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
1350       } else if (dst.regClass() == s2) {
1351          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
1352       } else {
1353          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1354          nir_print_instr(&instr->instr, stderr);
1355          fprintf(stderr, "\n");
1356       }
1357       break;
1358    }
1359    case nir_op_ixor: {
1360       if (instr->dest.dest.ssa.bit_size == 1) {
1361          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
1362       } else if (dst.regClass() == v1) {
1363          emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
1364       } else if (dst.regClass() == v2) {
1365          emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
1366       } else if (dst.regClass() == s1) {
1367          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
1368       } else if (dst.regClass() == s2) {
1369          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
1370       } else {
1371          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1372          nir_print_instr(&instr->instr, stderr);
1373          fprintf(stderr, "\n");
1374       }
1375       break;
1376    }
1377    case nir_op_ushr: {
1378       if (dst.regClass() == v1) {
1379          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
1380       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1381          bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
1382                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1383       } else if (dst.regClass() == v2) {
1384          bld.vop3(aco_opcode::v_lshr_b64, Definition(dst),
1385                   get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1386       } else if (dst.regClass() == s2) {
1387          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
1388       } else if (dst.regClass() == s1) {
1389          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
1390       } else {
1391          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1392          nir_print_instr(&instr->instr, stderr);
1393          fprintf(stderr, "\n");
1394       }
1395       break;
1396    }
1397    case nir_op_ishl: {
1398       if (dst.regClass() == v1) {
1399          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
1400       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1401          bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
1402                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1403       } else if (dst.regClass() == v2) {
1404          bld.vop3(aco_opcode::v_lshl_b64, Definition(dst),
1405                   get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1406       } else if (dst.regClass() == s1) {
1407          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
1408       } else if (dst.regClass() == s2) {
1409          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
1410       } else {
1411          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1412          nir_print_instr(&instr->instr, stderr);
1413          fprintf(stderr, "\n");
1414       }
1415       break;
1416    }
1417    case nir_op_ishr: {
1418       if (dst.regClass() == v1) {
1419          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
1420       } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) {
1421          bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
1422                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
1423       } else if (dst.regClass() == v2) {
1424          bld.vop3(aco_opcode::v_ashr_i64, Definition(dst),
1425                   get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1426       } else if (dst.regClass() == s1) {
1427          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
1428       } else if (dst.regClass() == s2) {
1429          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
1430       } else {
1431          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1432          nir_print_instr(&instr->instr, stderr);
1433          fprintf(stderr, "\n");
1434       }
1435       break;
1436    }
1437    case nir_op_find_lsb: {
1438       Temp src = get_alu_src(ctx, instr->src[0]);
1439       if (src.regClass() == s1) {
1440          bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
1441       } else if (src.regClass() == v1) {
1442          emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
1443       } else if (src.regClass() == s2) {
1444          bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
1445       } else {
1446          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1447          nir_print_instr(&instr->instr, stderr);
1448          fprintf(stderr, "\n");
1449       }
1450       break;
1451    }
1452    case nir_op_ufind_msb:
1453    case nir_op_ifind_msb: {
1454       Temp src = get_alu_src(ctx, instr->src[0]);
1455       if (src.regClass() == s1 || src.regClass() == s2) {
1456          aco_opcode op = src.regClass() == s2 ?
1457                          (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
1458                          (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
1459          Temp msb_rev = bld.sop1(op, bld.def(s1), src);
1460
1461          Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
1462                                         Operand(src.size() * 32u - 1u), msb_rev);
1463          Temp msb = sub.def(0).getTemp();
1464          Temp carry = sub.def(1).getTemp();
1465
1466          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry));
1467       } else if (src.regClass() == v1) {
1468          aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
1469          Temp msb_rev = bld.tmp(v1);
1470          emit_vop1_instruction(ctx, instr, op, msb_rev);
1471          Temp msb = bld.tmp(v1);
1472          Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
1473          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
1474       } else {
1475          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1476          nir_print_instr(&instr->instr, stderr);
1477          fprintf(stderr, "\n");
1478       }
1479       break;
1480    }
1481    case nir_op_bitfield_reverse: {
1482       if (dst.regClass() == s1) {
1483          bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1484       } else if (dst.regClass() == v1) {
1485          bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
1486       } else {
1487          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1488          nir_print_instr(&instr->instr, stderr);
1489          fprintf(stderr, "\n");
1490       }
1491       break;
1492    }
1493    case nir_op_iadd: {
1494       if (dst.regClass() == s1) {
1495          emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
1496          break;
1497       }
1498
1499       Temp src0 = get_alu_src(ctx, instr->src[0]);
1500       Temp src1 = get_alu_src(ctx, instr->src[1]);
1501       if (dst.regClass() == v1) {
1502          bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
1503          break;
1504       }
1505
1506       assert(src0.size() == 2 && src1.size() == 2);
1507       Temp src00 = bld.tmp(src0.type(), 1);
1508       Temp src01 = bld.tmp(dst.type(), 1);
1509       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1510       Temp src10 = bld.tmp(src1.type(), 1);
1511       Temp src11 = bld.tmp(dst.type(), 1);
1512       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1513
1514       if (dst.regClass() == s2) {
1515          Temp carry = bld.tmp(s1);
1516          Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1517          Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
1518          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1519       } else if (dst.regClass() == v2) {
1520          Temp dst0 = bld.tmp(v1);
1521          Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
1522          Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
1523          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1524       } else {
1525          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1526          nir_print_instr(&instr->instr, stderr);
1527          fprintf(stderr, "\n");
1528       }
1529       break;
1530    }
1531    case nir_op_uadd_sat: {
1532       Temp src0 = get_alu_src(ctx, instr->src[0]);
1533       Temp src1 = get_alu_src(ctx, instr->src[1]);
1534       if (dst.regClass() == s1) {
1535          Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1536          bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1537                   src0, src1);
1538          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1539       } else if (dst.regClass() == v1) {
1540          if (ctx->options->chip_class >= GFX9) {
1541             aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1542             add->operands[0] = Operand(src0);
1543             add->operands[1] = Operand(src1);
1544             add->definitions[0] = Definition(dst);
1545             add->clamp = 1;
1546             ctx->block->instructions.emplace_back(std::move(add));
1547          } else {
1548             if (src1.regClass() != v1)
1549                std::swap(src0, src1);
1550             assert(src1.regClass() == v1);
1551             Temp tmp = bld.tmp(v1);
1552             Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1553             bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1554          }
1555       } else {
1556          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1557          nir_print_instr(&instr->instr, stderr);
1558          fprintf(stderr, "\n");
1559       }
1560       break;
1561    }
1562    case nir_op_uadd_carry: {
1563       Temp src0 = get_alu_src(ctx, instr->src[0]);
1564       Temp src1 = get_alu_src(ctx, instr->src[1]);
1565       if (dst.regClass() == s1) {
1566          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1567          break;
1568       }
1569       if (dst.regClass() == v1) {
1570          Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1571          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1572          break;
1573       }
1574
1575       Temp src00 = bld.tmp(src0.type(), 1);
1576       Temp src01 = bld.tmp(dst.type(), 1);
1577       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1578       Temp src10 = bld.tmp(src1.type(), 1);
1579       Temp src11 = bld.tmp(dst.type(), 1);
1580       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1581       if (dst.regClass() == s2) {
1582          Temp carry = bld.tmp(s1);
1583          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1584          carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1585          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1586       } else if (dst.regClass() == v2) {
1587          Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1588          carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1589          carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1590          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1591       } else {
1592          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1593          nir_print_instr(&instr->instr, stderr);
1594          fprintf(stderr, "\n");
1595       }
1596       break;
1597    }
1598    case nir_op_isub: {
1599       if (dst.regClass() == s1) {
1600          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1601          break;
1602       }
1603
1604       Temp src0 = get_alu_src(ctx, instr->src[0]);
1605       Temp src1 = get_alu_src(ctx, instr->src[1]);
1606       if (dst.regClass() == v1) {
1607          bld.vsub32(Definition(dst), src0, src1);
1608          break;
1609       }
1610
1611       Temp src00 = bld.tmp(src0.type(), 1);
1612       Temp src01 = bld.tmp(dst.type(), 1);
1613       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1614       Temp src10 = bld.tmp(src1.type(), 1);
1615       Temp src11 = bld.tmp(dst.type(), 1);
1616       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1617       if (dst.regClass() == s2) {
1618          Temp carry = bld.tmp(s1);
1619          Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1620          Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1621          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1622       } else if (dst.regClass() == v2) {
1623          Temp lower = bld.tmp(v1);
1624          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1625          Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1626          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1627       } else {
1628          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1629          nir_print_instr(&instr->instr, stderr);
1630          fprintf(stderr, "\n");
1631       }
1632       break;
1633    }
1634    case nir_op_usub_borrow: {
1635       Temp src0 = get_alu_src(ctx, instr->src[0]);
1636       Temp src1 = get_alu_src(ctx, instr->src[1]);
1637       if (dst.regClass() == s1) {
1638          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1639          break;
1640       } else if (dst.regClass() == v1) {
1641          Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1642          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1643          break;
1644       }
1645
1646       Temp src00 = bld.tmp(src0.type(), 1);
1647       Temp src01 = bld.tmp(dst.type(), 1);
1648       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1649       Temp src10 = bld.tmp(src1.type(), 1);
1650       Temp src11 = bld.tmp(dst.type(), 1);
1651       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1652       if (dst.regClass() == s2) {
1653          Temp borrow = bld.tmp(s1);
1654          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1655          borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1656          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1657       } else if (dst.regClass() == v2) {
1658          Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1659          borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1660          borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1661          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1662       } else {
1663          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1664          nir_print_instr(&instr->instr, stderr);
1665          fprintf(stderr, "\n");
1666       }
1667       break;
1668    }
1669    case nir_op_imul: {
1670       if (dst.regClass() == v1) {
1671          bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1672                   get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1673       } else if (dst.regClass() == s1) {
1674          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1675       } else {
1676          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1677          nir_print_instr(&instr->instr, stderr);
1678          fprintf(stderr, "\n");
1679       }
1680       break;
1681    }
1682    case nir_op_umul_high: {
1683       if (dst.regClass() == v1) {
1684          bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1685       } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1686          bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1687       } else if (dst.regClass() == s1) {
1688          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1689                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1690          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1691       } else {
1692          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1693          nir_print_instr(&instr->instr, stderr);
1694          fprintf(stderr, "\n");
1695       }
1696       break;
1697    }
1698    case nir_op_imul_high: {
1699       if (dst.regClass() == v1) {
1700          bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1701       } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1702          bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1703       } else if (dst.regClass() == s1) {
1704          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1705                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1706          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1707       } else {
1708          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1709          nir_print_instr(&instr->instr, stderr);
1710          fprintf(stderr, "\n");
1711       }
1712       break;
1713    }
1714    case nir_op_fmul: {
1715       Temp src0 = get_alu_src(ctx, instr->src[0]);
1716       Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1717       if (dst.regClass() == v2b) {
1718          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
1719       } else if (dst.regClass() == v1) {
1720          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1721       } else if (dst.regClass() == v2) {
1722          bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1);
1723       } else {
1724          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1725          nir_print_instr(&instr->instr, stderr);
1726          fprintf(stderr, "\n");
1727       }
1728       break;
1729    }
1730    case nir_op_fadd: {
1731       Temp src0 = get_alu_src(ctx, instr->src[0]);
1732       Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1733       if (dst.regClass() == v2b) {
1734          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
1735       } else if (dst.regClass() == v1) {
1736          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1737       } else if (dst.regClass() == v2) {
1738          bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1);
1739       } else {
1740          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1741          nir_print_instr(&instr->instr, stderr);
1742          fprintf(stderr, "\n");
1743       }
1744       break;
1745    }
1746    case nir_op_fsub: {
1747       Temp src0 = get_alu_src(ctx, instr->src[0]);
1748       Temp src1 = get_alu_src(ctx, instr->src[1]);
1749       if (dst.regClass() == v2b) {
1750          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1751             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
1752          else
1753             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
1754       } else if (dst.regClass() == v1) {
1755          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1756             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1757          else
1758             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1759       } else if (dst.regClass() == v2) {
1760          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1761                                      as_vgpr(ctx, src0), as_vgpr(ctx, src1));
1762          VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1763          sub->neg[1] = true;
1764       } else {
1765          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1766          nir_print_instr(&instr->instr, stderr);
1767          fprintf(stderr, "\n");
1768       }
1769       break;
1770    }
1771    case nir_op_fmax: {
1772       Temp src0 = get_alu_src(ctx, instr->src[0]);
1773       Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1774       if (dst.regClass() == v2b) {
1775          // TODO: check fp_mode.must_flush_denorms16_64
1776          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
1777       } else if (dst.regClass() == v1) {
1778          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1779       } else if (dst.regClass() == v2) {
1780          if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1781             Temp tmp = bld.vop3(aco_opcode::v_max_f64, bld.def(v2), src0, src1);
1782             bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1783          } else {
1784             bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1);
1785          }
1786       } else {
1787          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1788          nir_print_instr(&instr->instr, stderr);
1789          fprintf(stderr, "\n");
1790       }
1791       break;
1792    }
1793    case nir_op_fmin: {
1794       Temp src0 = get_alu_src(ctx, instr->src[0]);
1795       Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
1796       if (dst.regClass() == v2b) {
1797          // TODO: check fp_mode.must_flush_denorms16_64
1798          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
1799       } else if (dst.regClass() == v1) {
1800          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
1801       } else if (dst.regClass() == v2) {
1802          if (ctx->block->fp_mode.must_flush_denorms16_64 && ctx->program->chip_class < GFX9) {
1803             Temp tmp = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), src0, src1);
1804             bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand(0x3FF0000000000000lu), tmp);
1805          } else {
1806             bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1);
1807          }
1808       } else {
1809          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1810          nir_print_instr(&instr->instr, stderr);
1811          fprintf(stderr, "\n");
1812       }
1813       break;
1814    }
1815    case nir_op_fmax3: {
1816       if (dst.regClass() == v2b) {
1817          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, dst, false);
1818       } else if (dst.regClass() == v1) {
1819          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1820       } else {
1821          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1822          nir_print_instr(&instr->instr, stderr);
1823          fprintf(stderr, "\n");
1824       }
1825       break;
1826    }
1827    case nir_op_fmin3: {
1828       if (dst.regClass() == v2b) {
1829          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, dst, false);
1830       } else if (dst.regClass() == v1) {
1831          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1832       } else {
1833          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1834          nir_print_instr(&instr->instr, stderr);
1835          fprintf(stderr, "\n");
1836       }
1837       break;
1838    }
1839    case nir_op_fmed3: {
1840       if (dst.regClass() == v2b) {
1841          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, dst, false);
1842       } else if (dst.regClass() == v1) {
1843          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
1844       } else {
1845          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1846          nir_print_instr(&instr->instr, stderr);
1847          fprintf(stderr, "\n");
1848       }
1849       break;
1850    }
1851    case nir_op_umax3: {
1852       if (dst.size() == 1) {
1853          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1854       } else {
1855          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1856          nir_print_instr(&instr->instr, stderr);
1857          fprintf(stderr, "\n");
1858       }
1859       break;
1860    }
1861    case nir_op_umin3: {
1862       if (dst.size() == 1) {
1863          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1864       } else {
1865          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1866          nir_print_instr(&instr->instr, stderr);
1867          fprintf(stderr, "\n");
1868       }
1869       break;
1870    }
1871    case nir_op_umed3: {
1872       if (dst.size() == 1) {
1873          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1874       } else {
1875          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1876          nir_print_instr(&instr->instr, stderr);
1877          fprintf(stderr, "\n");
1878       }
1879       break;
1880    }
1881    case nir_op_imax3: {
1882       if (dst.size() == 1) {
1883          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1884       } else {
1885          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1886          nir_print_instr(&instr->instr, stderr);
1887          fprintf(stderr, "\n");
1888       }
1889       break;
1890    }
1891    case nir_op_imin3: {
1892       if (dst.size() == 1) {
1893          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1894       } else {
1895          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1896          nir_print_instr(&instr->instr, stderr);
1897          fprintf(stderr, "\n");
1898       }
1899       break;
1900    }
1901    case nir_op_imed3: {
1902       if (dst.size() == 1) {
1903          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1904       } else {
1905          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1906          nir_print_instr(&instr->instr, stderr);
1907          fprintf(stderr, "\n");
1908       }
1909       break;
1910    }
1911    case nir_op_cube_face_coord: {
1912       Temp in = get_alu_src(ctx, instr->src[0], 3);
1913       Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1914                       emit_extract_vector(ctx, in, 1, v1),
1915                       emit_extract_vector(ctx, in, 2, v1) };
1916       Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1917       ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1918       Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1919       Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1920       sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1921       tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1922       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1923       break;
1924    }
1925    case nir_op_cube_face_index: {
1926       Temp in = get_alu_src(ctx, instr->src[0], 3);
1927       Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1928                       emit_extract_vector(ctx, in, 1, v1),
1929                       emit_extract_vector(ctx, in, 2, v1) };
1930       bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1931       break;
1932    }
1933    case nir_op_bcsel: {
1934       emit_bcsel(ctx, instr, dst);
1935       break;
1936    }
1937    case nir_op_frsq: {
1938       Temp src = get_alu_src(ctx, instr->src[0]);
1939       if (dst.regClass() == v2b) {
1940          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
1941       } else if (dst.regClass() == v1) {
1942          emit_rsq(ctx, bld, Definition(dst), src);
1943       } else if (dst.regClass() == v2) {
1944          /* Lowered at NIR level for precision reasons. */
1945          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1946       } else {
1947          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1948          nir_print_instr(&instr->instr, stderr);
1949          fprintf(stderr, "\n");
1950       }
1951       break;
1952    }
1953    case nir_op_fneg: {
1954       Temp src = get_alu_src(ctx, instr->src[0]);
1955       if (dst.regClass() == v2b) {
1956          if (ctx->block->fp_mode.must_flush_denorms16_64)
1957             src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
1958          bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
1959       } else if (dst.regClass() == v1) {
1960          if (ctx->block->fp_mode.must_flush_denorms32)
1961             src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1962          bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1963       } else if (dst.regClass() == v2) {
1964          if (ctx->block->fp_mode.must_flush_denorms16_64)
1965             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1966          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1967          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1968          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1969          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1970       } else {
1971          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1972          nir_print_instr(&instr->instr, stderr);
1973          fprintf(stderr, "\n");
1974       }
1975       break;
1976    }
1977    case nir_op_fabs: {
1978       Temp src = get_alu_src(ctx, instr->src[0]);
1979       if (dst.regClass() == v2b) {
1980          if (ctx->block->fp_mode.must_flush_denorms16_64)
1981             src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
1982          bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
1983       } else if (dst.regClass() == v1) {
1984          if (ctx->block->fp_mode.must_flush_denorms32)
1985             src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
1986          bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1987       } else if (dst.regClass() == v2) {
1988          if (ctx->block->fp_mode.must_flush_denorms16_64)
1989             src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand(0x3FF0000000000000lu), as_vgpr(ctx, src));
1990          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1991          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1992          upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1993          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1994       } else {
1995          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1996          nir_print_instr(&instr->instr, stderr);
1997          fprintf(stderr, "\n");
1998       }
1999       break;
2000    }
2001    case nir_op_fsat: {
2002       Temp src = get_alu_src(ctx, instr->src[0]);
2003       if (dst.regClass() == v2b) {
2004          bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src);
2005       } else if (dst.regClass() == v1) {
2006          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
2007          /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
2008          // TODO: confirm that this holds under any circumstances
2009       } else if (dst.regClass() == v2) {
2010          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
2011          VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
2012          vop3->clamp = true;
2013       } else {
2014          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2015          nir_print_instr(&instr->instr, stderr);
2016          fprintf(stderr, "\n");
2017       }
2018       break;
2019    }
2020    case nir_op_flog2: {
2021       Temp src = get_alu_src(ctx, instr->src[0]);
2022       if (dst.regClass() == v2b) {
2023          emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
2024       } else if (dst.regClass() == v1) {
2025          emit_log2(ctx, bld, Definition(dst), src);
2026       } else {
2027          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2028          nir_print_instr(&instr->instr, stderr);
2029          fprintf(stderr, "\n");
2030       }
2031       break;
2032    }
2033    case nir_op_frcp: {
2034       Temp src = get_alu_src(ctx, instr->src[0]);
2035       if (dst.regClass() == v2b) {
2036          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
2037       } else if (dst.regClass() == v1) {
2038          emit_rcp(ctx, bld, Definition(dst), src);
2039       } else if (dst.regClass() == v2) {
2040          /* Lowered at NIR level for precision reasons. */
2041          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
2042       } else {
2043          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2044          nir_print_instr(&instr->instr, stderr);
2045          fprintf(stderr, "\n");
2046       }
2047       break;
2048    }
2049    case nir_op_fexp2: {
2050       if (dst.regClass() == v2b) {
2051          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
2052       } else if (dst.regClass() == v1) {
2053          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
2054       } else {
2055          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2056          nir_print_instr(&instr->instr, stderr);
2057          fprintf(stderr, "\n");
2058       }
2059       break;
2060    }
2061    case nir_op_fsqrt: {
2062       Temp src = get_alu_src(ctx, instr->src[0]);
2063       if (dst.regClass() == v2b) {
2064          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
2065       } else if (dst.regClass() == v1) {
2066          emit_sqrt(ctx, bld, Definition(dst), src);
2067       } else if (dst.regClass() == v2) {
2068          /* Lowered at NIR level for precision reasons. */
2069          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
2070       } else {
2071          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2072          nir_print_instr(&instr->instr, stderr);
2073          fprintf(stderr, "\n");
2074       }
2075       break;
2076    }
2077    case nir_op_ffract: {
2078       if (dst.regClass() == v2b) {
2079          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
2080       } else if (dst.regClass() == v1) {
2081          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
2082       } else if (dst.regClass() == v2) {
2083          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
2084       } else {
2085          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2086          nir_print_instr(&instr->instr, stderr);
2087          fprintf(stderr, "\n");
2088       }
2089       break;
2090    }
2091    case nir_op_ffloor: {
2092       Temp src = get_alu_src(ctx, instr->src[0]);
2093       if (dst.regClass() == v2b) {
2094          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
2095       } else if (dst.regClass() == v1) {
2096          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
2097       } else if (dst.regClass() == v2) {
2098          emit_floor_f64(ctx, bld, Definition(dst), src);
2099       } else {
2100          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2101          nir_print_instr(&instr->instr, stderr);
2102          fprintf(stderr, "\n");
2103       }
2104       break;
2105    }
2106    case nir_op_fceil: {
2107       Temp src0 = get_alu_src(ctx, instr->src[0]);
2108       if (dst.regClass() == v2b) {
2109          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
2110       } else if (dst.regClass() == v1) {
2111          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
2112       } else if (dst.regClass() == v2) {
2113          if (ctx->options->chip_class >= GFX7) {
2114             emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
2115          } else {
2116             /* GFX6 doesn't support V_CEIL_F64, lower it. */
2117             /* trunc = trunc(src0)
2118              * if (src0 > 0.0 && src0 != trunc)
2119              *    trunc += 1.0
2120              */
2121             Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0);
2122             Temp tmp0 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand(0u));
2123             Temp tmp1 = bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc);
2124             Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), tmp0, tmp1);
2125             Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand(0u)), bld.copy(bld.def(v1), Operand(0x3ff00000u)), cond);
2126             add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), bld.copy(bld.def(v1), Operand(0u)), add);
2127             bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
2128          }
2129       } else {
2130          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2131          nir_print_instr(&instr->instr, stderr);
2132          fprintf(stderr, "\n");
2133       }
2134       break;
2135    }
2136    case nir_op_ftrunc: {
2137       Temp src = get_alu_src(ctx, instr->src[0]);
2138       if (dst.regClass() == v2b) {
2139          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
2140       } else if (dst.regClass() == v1) {
2141          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
2142       } else if (dst.regClass() == v2) {
2143          emit_trunc_f64(ctx, bld, Definition(dst), src);
2144       } else {
2145          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2146          nir_print_instr(&instr->instr, stderr);
2147          fprintf(stderr, "\n");
2148       }
2149       break;
2150    }
   case nir_op_fround_even: {
      /* Round to nearest even. 16-bit and 32-bit destinations map to a single
       * v_rndne instruction, as does 64-bit on GFX7 and newer. On older chips
       * the 64-bit case is lowered by hand below.
       */
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      if (dst.regClass() == v2b) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
      } else if (dst.regClass() == v1) {
         emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
      } else if (dst.regClass() == v2) {
         if (ctx->options->chip_class >= GFX7) {
            emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
         } else {
            /* GFX6 doesn't support V_RNDNE_F64, lower it. */
            Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1);
            bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0);

            /* bitmask is the bit-reversal of -2u, i.e. every bit except the
             * sign bit. The bitfield insert combines 0x43300000 (the high
             * dword of 2^52 as f64) with the high dword of the source through
             * that mask; NOTE(review): presumably this yields 2^52 carrying
             * the sign of src0 - confirm against the v_bfi_b32 definition.
             */
            Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), bld.copy(bld.def(s1), Operand(-2u)));
            Temp bfi = bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, bld.copy(bld.def(v1), Operand(0x43300000u)), as_vgpr(ctx, src0_hi));
            /* tmp = (src0 + magic) - magic, where magic is the f64 {0, bfi}.
             * The subtraction is an add with the second operand negated.
             */
            Temp tmp = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
            Instruction *sub = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), bfi));
            static_cast<VOP3A_instruction*>(sub)->neg[1] = true;
            tmp = sub->definitions[0].getTemp();

            /* v = 0x432fffffffffffff, the largest f64 below 2^52. The compare
             * takes the absolute value of src0 (abs[0]), so cond is set when
             * |src0| > v.
             */
            Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x432fffffu));
            Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v);
            static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
            Temp cond = vop3->definitions[0].getTemp();

            /* Per dword, select between the rounded value (tmp) and the
             * original source depending on cond; the source is picked where
             * cond is set.
             */
            Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1);
            bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp);
            Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, as_vgpr(ctx, src0_lo), cond);
            Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, as_vgpr(ctx, src0_hi), cond);

            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
         }
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
2191    case nir_op_fsin:
2192    case nir_op_fcos: {
2193       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2194       aco_ptr<Instruction> norm;
2195       if (dst.regClass() == v2b) {
2196          Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u));
2197          Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
2198          aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
2199          bld.vop1(opcode, Definition(dst), tmp);
2200       } else if (dst.regClass() == v1) {
2201          Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
2202          Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
2203
2204          /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
2205          if (ctx->options->chip_class < GFX9)
2206             tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
2207
2208          aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
2209          bld.vop1(opcode, Definition(dst), tmp);
2210       } else {
2211          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2212          nir_print_instr(&instr->instr, stderr);
2213          fprintf(stderr, "\n");
2214       }
2215       break;
2216    }
2217    case nir_op_ldexp: {
2218       Temp src0 = get_alu_src(ctx, instr->src[0]);
2219       Temp src1 = get_alu_src(ctx, instr->src[1]);
2220       if (dst.regClass() == v2b) {
2221          emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
2222       } else if (dst.regClass() == v1) {
2223          bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1);
2224       } else if (dst.regClass() == v2) {
2225          bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), as_vgpr(ctx, src0), src1);
2226       } else {
2227          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2228          nir_print_instr(&instr->instr, stderr);
2229          fprintf(stderr, "\n");
2230       }
2231       break;
2232    }
2233    case nir_op_frexp_sig: {
2234       Temp src = get_alu_src(ctx, instr->src[0]);
2235       if (dst.regClass() == v2b) {
2236          bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src);
2237       } else if (dst.regClass() == v1) {
2238          bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src);
2239       } else if (dst.regClass() == v2) {
2240          bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src);
2241       } else {
2242          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2243          nir_print_instr(&instr->instr, stderr);
2244          fprintf(stderr, "\n");
2245       }
2246       break;
2247    }
2248    case nir_op_frexp_exp: {
2249       Temp src = get_alu_src(ctx, instr->src[0]);
2250       if (instr->src[0].src.ssa->bit_size == 16) {
2251          Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
2252          tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand(0u));
2253          convert_int(ctx, bld, tmp, 8, 32, true, dst);
2254       } else if (instr->src[0].src.ssa->bit_size == 32) {
2255          bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src);
2256       } else if (instr->src[0].src.ssa->bit_size == 64) {
2257          bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src);
2258       } else {
2259          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2260          nir_print_instr(&instr->instr, stderr);
2261          fprintf(stderr, "\n");
2262       }
2263       break;
2264    }
2265    case nir_op_fsign: {
2266       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
2267       if (dst.regClass() == v2b) {
2268          Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
2269          Temp minus_one = bld.copy(bld.def(v1), Operand(0xbc00u));
2270          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2271          src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond);
2272          cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2273          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, cond);
2274       } else if (dst.regClass() == v1) {
2275          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2276          src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
2277          cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2278          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
2279       } else if (dst.regClass() == v2) {
2280          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2281          Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
2282          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, emit_extract_vector(ctx, src, 1, v1), cond);
2283
2284          cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
2285          tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
2286          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
2287
2288          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2289       } else {
2290          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2291          nir_print_instr(&instr->instr, stderr);
2292          fprintf(stderr, "\n");
2293       }
2294       break;
2295    }
2296    case nir_op_f2f16:
2297    case nir_op_f2f16_rtne: {
2298       Temp src = get_alu_src(ctx, instr->src[0]);
2299       if (instr->src[0].src.ssa->bit_size == 64)
2300          src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2301       if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
2302          /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
2303           * keep value numbering and the scheduler simpler.
2304           */
2305          bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
2306       else
2307          bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
2308       break;
2309    }
2310    case nir_op_f2f16_rtz: {
2311       Temp src = get_alu_src(ctx, instr->src[0]);
2312       if (instr->src[0].src.ssa->bit_size == 64)
2313          src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
2314       bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u));
2315       break;
2316    }
2317    case nir_op_f2f32: {
2318       if (instr->src[0].src.ssa->bit_size == 16) {
2319          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst);
2320       } else if (instr->src[0].src.ssa->bit_size == 64) {
2321          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
2322       } else {
2323          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2324          nir_print_instr(&instr->instr, stderr);
2325          fprintf(stderr, "\n");
2326       }
2327       break;
2328    }
2329    case nir_op_f2f64: {
2330       Temp src = get_alu_src(ctx, instr->src[0]);
2331       if (instr->src[0].src.ssa->bit_size == 16)
2332          src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2333       bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src);
2334       break;
2335    }
2336    case nir_op_i2f16: {
2337       assert(dst.regClass() == v2b);
2338       Temp src = get_alu_src(ctx, instr->src[0]);
2339       if (instr->src[0].src.ssa->bit_size == 8)
2340          src = convert_int(ctx, bld, src, 8, 16, true);
2341       else if (instr->src[0].src.ssa->bit_size == 64)
2342          src = convert_int(ctx, bld, src, 64, 32, false);
2343       bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
2344       break;
2345    }
2346    case nir_op_i2f32: {
2347       assert(dst.size() == 1);
2348       Temp src = get_alu_src(ctx, instr->src[0]);
2349       if (instr->src[0].src.ssa->bit_size <= 16)
2350          src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2351       bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
2352       break;
2353    }
2354    case nir_op_i2f64: {
2355       if (instr->src[0].src.ssa->bit_size <= 32) {
2356          Temp src = get_alu_src(ctx, instr->src[0]);
2357          if (instr->src[0].src.ssa->bit_size <= 16)
2358             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2359          bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
2360       } else if (instr->src[0].src.ssa->bit_size == 64) {
2361          Temp src = get_alu_src(ctx, instr->src[0]);
2362          RegClass rc = RegClass(src.type(), 1);
2363          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2364          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2365          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2366          upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
2367          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
2368          bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2369
2370       } else {
2371          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2372          nir_print_instr(&instr->instr, stderr);
2373          fprintf(stderr, "\n");
2374       }
2375       break;
2376    }
2377    case nir_op_u2f16: {
2378       assert(dst.regClass() == v2b);
2379       Temp src = get_alu_src(ctx, instr->src[0]);
2380       if (instr->src[0].src.ssa->bit_size == 8)
2381          src = convert_int(ctx, bld, src, 8, 16, false);
2382       else if (instr->src[0].src.ssa->bit_size == 64)
2383          src = convert_int(ctx, bld, src, 64, 32, false);
2384       bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
2385       break;
2386    }
2387    case nir_op_u2f32: {
2388       assert(dst.size() == 1);
2389       Temp src = get_alu_src(ctx, instr->src[0]);
2390       if (instr->src[0].src.ssa->bit_size == 8) {
2391          bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
2392       } else {
2393          if (instr->src[0].src.ssa->bit_size == 16)
2394             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
2395          bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
2396       }
2397       break;
2398    }
2399    case nir_op_u2f64: {
2400       if (instr->src[0].src.ssa->bit_size <= 32) {
2401          Temp src = get_alu_src(ctx, instr->src[0]);
2402          if (instr->src[0].src.ssa->bit_size <= 16)
2403             src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
2404          bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
2405       } else if (instr->src[0].src.ssa->bit_size == 64) {
2406          Temp src = get_alu_src(ctx, instr->src[0]);
2407          RegClass rc = RegClass(src.type(), 1);
2408          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
2409          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
2410          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
2411          upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
2412          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
2413          bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
2414       } else {
2415          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2416          nir_print_instr(&instr->instr, stderr);
2417          fprintf(stderr, "\n");
2418       }
2419       break;
2420    }
2421    case nir_op_f2i8:
2422    case nir_op_f2i16: {
2423       if (instr->src[0].src.ssa->bit_size == 16)
2424          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
2425       else if (instr->src[0].src.ssa->bit_size == 32)
2426          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2427       else
2428          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2429       break;
2430    }
2431    case nir_op_f2u8:
2432    case nir_op_f2u16: {
2433       if (instr->src[0].src.ssa->bit_size == 16)
2434          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
2435       else if (instr->src[0].src.ssa->bit_size == 32)
2436          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2437       else
2438          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2439       break;
2440    }
2441    case nir_op_f2i32: {
2442       Temp src = get_alu_src(ctx, instr->src[0]);
2443       if (instr->src[0].src.ssa->bit_size == 16) {
2444          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2445          if (dst.type() == RegType::vgpr) {
2446             bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp);
2447          } else {
2448             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2449                        bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
2450          }
2451       } else if (instr->src[0].src.ssa->bit_size == 32) {
2452          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
2453       } else if (instr->src[0].src.ssa->bit_size == 64) {
2454          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
2455       } else {
2456          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2457          nir_print_instr(&instr->instr, stderr);
2458          fprintf(stderr, "\n");
2459       }
2460       break;
2461    }
2462    case nir_op_f2u32: {
2463       Temp src = get_alu_src(ctx, instr->src[0]);
2464       if (instr->src[0].src.ssa->bit_size == 16) {
2465          Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2466          if (dst.type() == RegType::vgpr) {
2467             bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp);
2468          } else {
2469             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
2470                        bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
2471          }
2472       } else if (instr->src[0].src.ssa->bit_size == 32) {
2473          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
2474       } else if (instr->src[0].src.ssa->bit_size == 64) {
2475          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
2476       } else {
2477          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2478          nir_print_instr(&instr->instr, stderr);
2479          fprintf(stderr, "\n");
2480       }
2481       break;
2482    }
   case nir_op_f2i64: {
      /* Float -> signed 64-bit integer. There is no single instruction for
       * this, so the result is assembled by hand. Three paths exist:
       *  - 16/32-bit source, VGPR destination (vector ALU bit manipulation),
       *  - 16/32-bit source, SGPR destination (the same idea on the scalar ALU),
       *  - 64-bit source (split into two 32-bit conversions).
       */
      Temp src = get_alu_src(ctx, instr->src[0]);
      /* Half sources are widened to f32 so the 32-bit paths can be reused. */
      if (instr->src[0].src.ssa->bit_size == 16)
         src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);

      if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
         /* Exponent, clamped to [0, 64]. */
         Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
         exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
         /* Low 23 bits of the source with bit 23 (the implicit one) set. */
         Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
         /* Arithmetic shift by 31: all ones for negative sources, else zero. */
         Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
         mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
         /* Shift the mantissa left by 7 and place it in the high dword of a
          * 64-bit value, then shift that right by (63 - exponent).
          */
         mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
         mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
         Temp new_exponent = bld.tmp(v1);
         /* borrow is the second definition of the 63 - exponent subtraction. */
         Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
         /* The two opcodes take the shift amount and the value in opposite
          * operand order.
          */
         if (ctx->program->chip_class >= GFX8)
            mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
         else
            mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent);
         /* When borrow is set, the magnitude is replaced by the saturation
          * value {0xffffffff, bit-reversed 0xfffffffe}.
          */
         Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
         Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
         lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
         upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
         /* Apply the sign: (x ^ sign) - sign as a 64-bit operation, with the
          * borrow of the low half fed into the high half.
          */
         lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
         upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
         Temp new_lower = bld.tmp(v1);
         borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
         Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);

      } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
         /* Scalar variant of the path above; the source is made uniform first. */
         if (src.type() == RegType::vgpr)
            src = bld.as_uniform(src);
         /* Bitfield-extract with 0x80017, then subtract 126 and clamp to
          * [0, 64]. NOTE(review): 0x80017 presumably encodes width 8 at
          * offset 23, i.e. the f32 exponent field - confirm against the
          * s_bfe_u32 definition.
          */
         Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
         exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
         exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
         exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
         Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
         Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
         mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
         mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
         mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
         /* exponent now holds the right-shift amount 63 - exponent. */
         exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
         mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
         /* 63 - 64 wraps to 0xffffffff, which marks the saturating case. */
         Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
         Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
         mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
         Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
         /* Apply the sign: (x ^ sign) - sign, with borrow into the high half. */
         lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
         upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
         Temp borrow = bld.tmp(s1);
         lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
         upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);

      } else if (instr->src[0].src.ssa->bit_size == 64) {
         /* 64-bit source: with t = trunc(src),
          *   floor = floor(t * 2^-32)   -> converted to the high dword (signed)
          *   fma   = floor * -2^32 + t  -> converted to the low dword (unsigned)
          * {0, 0x3df00000} is 2^-32 and {0, 0xc1f00000} is -2^32 as f64.
          */
         Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
         Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
         Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
         Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
         Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
         Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
         Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
         /* The conversions produce VGPRs; make them uniform for an SGPR result. */
         if (dst.type() == RegType::sgpr) {
            lower = bld.as_uniform(lower);
            upper = bld.as_uniform(upper);
         }
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);

      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
2562    case nir_op_f2u64: {
2563       Temp src = get_alu_src(ctx, instr->src[0]);
2564       if (instr->src[0].src.ssa->bit_size == 16)
2565          src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src);
2566
2567       if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) {
2568          Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
2569          Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), Operand(64u), exponent);
2570          exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
2571          Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
2572          mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
2573          Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
2574          Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
2575          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
2576          Temp new_exponent = bld.tmp(v1);
2577          Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
2578          if (ctx->program->chip_class >= GFX8)
2579             mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
2580          else
2581             mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent);
2582          Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
2583          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2584          lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
2585          upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
2586          lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
2587          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
2588          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2589
2590       } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) {
2591          if (src.type() == RegType::vgpr)
2592             src = bld.as_uniform(src);
2593          Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
2594          exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
2595          exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
2596          Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
2597          mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
2598          Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
2599          Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
2600          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
2601          Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
2602          mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
2603          Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
2604          mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
2605          Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
2606          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
2607          Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
2608          lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
2609          upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
2610          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2611
2612       } else if (instr->src[0].src.ssa->bit_size == 64) {
2613          Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
2614          Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src);
2615          Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
2616          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
2617          Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul);
2618          Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
2619          Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
2620          Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
2621          if (dst.type() == RegType::sgpr) {
2622             lower = bld.as_uniform(lower);
2623             upper = bld.as_uniform(upper);
2624          }
2625          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
2626
2627       } else {
2628          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2629          nir_print_instr(&instr->instr, stderr);
2630          fprintf(stderr, "\n");
2631       }
2632       break;
2633    }
2634    case nir_op_b2f16: {
2635       Temp src = get_alu_src(ctx, instr->src[0]);
2636       assert(src.regClass() == bld.lm);
2637
2638       if (dst.regClass() == s1) {
2639          src = bool_to_scalar_condition(ctx, src);
2640          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3c00u), src);
2641       } else if (dst.regClass() == v2b) {
2642          Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
2643          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), one, src);
2644       } else {
2645          unreachable("Wrong destination register class for nir_op_b2f16.");
2646       }
2647       break;
2648    }
2649    case nir_op_b2f32: {
2650       Temp src = get_alu_src(ctx, instr->src[0]);
2651       assert(src.regClass() == bld.lm);
2652
2653       if (dst.regClass() == s1) {
2654          src = bool_to_scalar_condition(ctx, src);
2655          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
2656       } else if (dst.regClass() == v1) {
2657          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
2658       } else {
2659          unreachable("Wrong destination register class for nir_op_b2f32.");
2660       }
2661       break;
2662    }
2663    case nir_op_b2f64: {
2664       Temp src = get_alu_src(ctx, instr->src[0]);
2665       assert(src.regClass() == bld.lm);
2666
2667       if (dst.regClass() == s2) {
2668          src = bool_to_scalar_condition(ctx, src);
2669          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
2670       } else if (dst.regClass() == v2) {
2671          Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v2), Operand(0x3FF00000u));
2672          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
2673          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
2674       } else {
2675          unreachable("Wrong destination register class for nir_op_b2f64.");
2676       }
2677       break;
2678    }
2679    case nir_op_i2i8:
2680    case nir_op_i2i16:
2681    case nir_op_i2i32:
2682    case nir_op_i2i64: {
2683       convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
2684                   instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, true, dst);
2685       break;
2686    }
2687    case nir_op_u2u8:
2688    case nir_op_u2u16:
2689    case nir_op_u2u32:
2690    case nir_op_u2u64: {
2691       convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
2692                   instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst);
2693       break;
2694    }
2695    case nir_op_b2b32:
2696    case nir_op_b2i32: {
2697       Temp src = get_alu_src(ctx, instr->src[0]);
2698       assert(src.regClass() == bld.lm);
2699
2700       if (dst.regClass() == s1) {
2701          // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
2702          bool_to_scalar_condition(ctx, src, dst);
2703       } else if (dst.regClass() == v1) {
2704          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
2705       } else {
2706          unreachable("Invalid register class for b2i32");
2707       }
2708       break;
2709    }
2710    case nir_op_b2b1:
2711    case nir_op_i2b1: {
2712       Temp src = get_alu_src(ctx, instr->src[0]);
2713       assert(dst.regClass() == bld.lm);
2714
2715       if (src.type() == RegType::vgpr) {
2716          assert(src.regClass() == v1 || src.regClass() == v2);
2717          assert(dst.regClass() == bld.lm);
2718          bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
2719                   Definition(dst), Operand(0u), src).def(0).setHint(vcc);
2720       } else {
2721          assert(src.regClass() == s1 || src.regClass() == s2);
2722          Temp tmp;
2723          if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) {
2724             tmp = bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand(0u), src).def(1).getTemp();
2725          } else {
2726             tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32,
2727                            bld.scc(bld.def(s1)), Operand(0u), src);
2728          }
2729          bool_to_vector_condition(ctx, tmp, dst);
2730       }
2731       break;
2732    }
2733    case nir_op_pack_64_2x32_split: {
2734       Temp src0 = get_alu_src(ctx, instr->src[0]);
2735       Temp src1 = get_alu_src(ctx, instr->src[1]);
2736
2737       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2738       break;
2739    }
2740    case nir_op_unpack_64_2x32_split_x:
2741       bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2742       break;
2743    case nir_op_unpack_64_2x32_split_y:
2744       bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2745       break;
2746    case nir_op_unpack_32_2x16_split_x:
2747       if (dst.type() == RegType::vgpr) {
2748          bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2749       } else {
2750          bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0]));
2751       }
2752       break;
2753    case nir_op_unpack_32_2x16_split_y:
2754       if (dst.type() == RegType::vgpr) {
2755          bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2756       } else {
2757          bld.sop2(aco_opcode::s_bfe_u32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]), Operand(uint32_t(16 << 16 | 16)));
2758       }
2759       break;
2760    case nir_op_pack_32_2x16_split: {
2761       Temp src0 = get_alu_src(ctx, instr->src[0]);
2762       Temp src1 = get_alu_src(ctx, instr->src[1]);
2763       if (dst.regClass() == v1) {
2764          src0 = emit_extract_vector(ctx, src0, 0, v2b);
2765          src1 = emit_extract_vector(ctx, src1, 0, v2b);
2766          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2767       } else {
2768          src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, Operand(0xFFFFu));
2769          src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), src1, Operand(16u));
2770          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1);
2771       }
2772       break;
2773    }
2774    case nir_op_pack_half_2x16: {
2775       Temp src = get_alu_src(ctx, instr->src[0], 2);
2776
2777       if (dst.regClass() == v1) {
2778          Temp src0 = bld.tmp(v1);
2779          Temp src1 = bld.tmp(v1);
2780          bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2781          if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz)
2782             bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2783          else
2784             bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst),
2785                      bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0),
2786                      bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1));
2787       } else {
2788          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2789          nir_print_instr(&instr->instr, stderr);
2790          fprintf(stderr, "\n");
2791       }
2792       break;
2793    }
2794    case nir_op_unpack_half_2x16_split_x: {
2795       if (dst.regClass() == v1) {
2796          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2797       } else {
2798          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2799          nir_print_instr(&instr->instr, stderr);
2800          fprintf(stderr, "\n");
2801       }
2802       break;
2803    }
2804    case nir_op_unpack_half_2x16_split_y: {
2805       if (dst.regClass() == v1) {
2806          /* TODO: use SDWA here */
2807          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2808                   bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2809       } else {
2810          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2811          nir_print_instr(&instr->instr, stderr);
2812          fprintf(stderr, "\n");
2813       }
2814       break;
2815    }
2816    case nir_op_fquantize2f16: {
2817       Temp src = get_alu_src(ctx, instr->src[0]);
2818       Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
2819       Temp f32, cmp_res;
2820
2821       if (ctx->program->chip_class >= GFX8) {
2822          Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */
2823          cmp_res = bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask);
2824          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2825       } else {
2826          /* 0x38800000 is smallest half float value (2^-14) in 32-bit float,
2827           * so compare the result and flush to 0 if it's smaller.
2828           */
2829          f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2830          Temp smallest = bld.copy(bld.def(s1), Operand(0x38800000u));
2831          Instruction* vop3 = bld.vopc_e64(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), f32, smallest);
2832          static_cast<VOP3A_instruction*>(vop3)->abs[0] = true;
2833          cmp_res = vop3->definitions[0].getTemp();
2834       }
2835
2836       if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32 || ctx->program->chip_class < GFX8) {
2837          Temp copysign_0 = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0u), as_vgpr(ctx, src));
2838          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res);
2839       } else {
2840          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2841       }
2842       break;
2843    }
2844    case nir_op_bfm: {
2845       Temp bits = get_alu_src(ctx, instr->src[0]);
2846       Temp offset = get_alu_src(ctx, instr->src[1]);
2847
2848       if (dst.regClass() == s1) {
2849          bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2850       } else if (dst.regClass() == v1) {
2851          bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2852       } else {
2853          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2854          nir_print_instr(&instr->instr, stderr);
2855          fprintf(stderr, "\n");
2856       }
2857       break;
2858    }
2859    case nir_op_bitfield_select: {
2860       /* (mask & insert) | (~mask & base) */
2861       Temp bitmask = get_alu_src(ctx, instr->src[0]);
2862       Temp insert = get_alu_src(ctx, instr->src[1]);
2863       Temp base = get_alu_src(ctx, instr->src[2]);
2864
2865       /* dst = (insert & bitmask) | (base & ~bitmask) */
2866       if (dst.regClass() == s1) {
2867          aco_ptr<Instruction> sop2;
2868          nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2869          nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2870          Operand lhs;
2871          if (const_insert && const_bitmask) {
2872             lhs = Operand(const_insert->u32 & const_bitmask->u32);
2873          } else {
2874             insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2875             lhs = Operand(insert);
2876          }
2877
2878          Operand rhs;
2879          nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2880          if (const_base && const_bitmask) {
2881             rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2882          } else {
2883             base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2884             rhs = Operand(base);
2885          }
2886
2887          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2888
2889       } else if (dst.regClass() == v1) {
2890          if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2891             base = as_vgpr(ctx, base);
2892          if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2893             insert = as_vgpr(ctx, insert);
2894
2895          bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2896
2897       } else {
2898          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2899          nir_print_instr(&instr->instr, stderr);
2900          fprintf(stderr, "\n");
2901       }
2902       break;
2903    }
2904    case nir_op_ubfe:
2905    case nir_op_ibfe: {
2906       Temp base = get_alu_src(ctx, instr->src[0]);
2907       Temp offset = get_alu_src(ctx, instr->src[1]);
2908       Temp bits = get_alu_src(ctx, instr->src[2]);
2909
2910       if (dst.type() == RegType::sgpr) {
2911          Operand extract;
2912          nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2913          nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2914          if (const_offset && const_bits) {
2915             uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2916             extract = Operand(const_extract);
2917          } else {
2918             Operand width;
2919             if (const_bits) {
2920                width = Operand(const_bits->u32 << 16);
2921             } else {
2922                width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2923             }
2924             extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2925          }
2926
2927          aco_opcode opcode;
2928          if (dst.regClass() == s1) {
2929             if (instr->op == nir_op_ubfe)
2930                opcode = aco_opcode::s_bfe_u32;
2931             else
2932                opcode = aco_opcode::s_bfe_i32;
2933          } else if (dst.regClass() == s2) {
2934             if (instr->op == nir_op_ubfe)
2935                opcode = aco_opcode::s_bfe_u64;
2936             else
2937                opcode = aco_opcode::s_bfe_i64;
2938          } else {
2939             unreachable("Unsupported BFE bit size");
2940          }
2941
2942          bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2943
2944       } else {
2945          aco_opcode opcode;
2946          if (dst.regClass() == v1) {
2947             if (instr->op == nir_op_ubfe)
2948                opcode = aco_opcode::v_bfe_u32;
2949             else
2950                opcode = aco_opcode::v_bfe_i32;
2951          } else {
2952             unreachable("Unsupported BFE bit size");
2953          }
2954
2955          emit_vop3a_instruction(ctx, instr, opcode, dst);
2956       }
2957       break;
2958    }
2959    case nir_op_bit_count: {
2960       Temp src = get_alu_src(ctx, instr->src[0]);
2961       if (src.regClass() == s1) {
2962          bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2963       } else if (src.regClass() == v1) {
2964          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2965       } else if (src.regClass() == v2) {
2966          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2967                   emit_extract_vector(ctx, src, 1, v1),
2968                   bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2969                            emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2970       } else if (src.regClass() == s2) {
2971          bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2972       } else {
2973          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2974          nir_print_instr(&instr->instr, stderr);
2975          fprintf(stderr, "\n");
2976       }
2977       break;
2978    }
2979    case nir_op_flt: {
2980       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32, aco_opcode::v_cmp_lt_f64);
2981       break;
2982    }
2983    case nir_op_fge: {
2984       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32, aco_opcode::v_cmp_ge_f64);
2985       break;
2986    }
2987    case nir_op_feq: {
2988       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64);
2989       break;
2990    }
2991    case nir_op_fne: {
2992       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64);
2993       break;
2994    }
2995    case nir_op_ilt: {
2996       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32, aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32);
2997       break;
2998    }
2999    case nir_op_ige: {
3000       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32, aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32);
3001       break;
3002    }
3003    case nir_op_ieq: {
3004       if (instr->src[0].src.ssa->bit_size == 1)
3005          emit_boolean_logic(ctx, instr, Builder::s_xnor, dst);
3006       else
3007          emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32, aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32,
3008                          ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes);
3009       break;
3010    }
3011    case nir_op_ine: {
3012       if (instr->src[0].src.ssa->bit_size == 1)
3013          emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
3014       else
3015          emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32, aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32,
3016                          ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes);
3017       break;
3018    }
3019    case nir_op_ult: {
3020       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32, aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32);
3021       break;
3022    }
3023    case nir_op_uge: {
3024       emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32, aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32);
3025       break;
3026    }
3027    case nir_op_fddx:
3028    case nir_op_fddy:
3029    case nir_op_fddx_fine:
3030    case nir_op_fddy_fine:
3031    case nir_op_fddx_coarse:
3032    case nir_op_fddy_coarse: {
3033       Temp src = get_alu_src(ctx, instr->src[0]);
3034       uint16_t dpp_ctrl1, dpp_ctrl2;
3035       if (instr->op == nir_op_fddx_fine) {
3036          dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2);
3037          dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3);
3038       } else if (instr->op == nir_op_fddy_fine) {
3039          dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1);
3040          dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3);
3041       } else {
3042          dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0);
3043          if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
3044             dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1);
3045          else
3046             dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
3047       }
3048
3049       Temp tmp;
3050       if (ctx->program->chip_class >= GFX8) {
3051          Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1);
3052          tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2);
3053       } else {
3054          Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1);
3055          Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2);
3056          tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl);
3057       }
3058       emit_wqm(ctx, tmp, dst, true);
3059       break;
3060    }
3061    default:
3062       fprintf(stderr, "Unknown NIR ALU instr: ");
3063       nir_print_instr(&instr->instr, stderr);
3064       fprintf(stderr, "\n");
3065    }
3066 }
3067
3068 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
3069 {
3070    Temp dst = get_ssa_temp(ctx, &instr->def);
3071
3072    // TODO: we really want to have the resulting type as this would allow for 64bit literals
3073    // which get truncated the lsb if double and msb if int
3074    // for now, we only use s_mov_b64 with 64bit inline constants
3075    assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
3076    assert(dst.type() == RegType::sgpr);
3077
3078    Builder bld(ctx->program, ctx->block);
3079
3080    if (instr->def.bit_size == 1) {
3081       assert(dst.regClass() == bld.lm);
3082       int val = instr->value[0].b ? -1 : 0;
3083       Operand op = bld.lm.size() == 1 ? Operand((uint32_t) val) : Operand((uint64_t) val);
3084       bld.sop1(Builder::s_mov, Definition(dst), op);
3085    } else if (instr->def.bit_size == 8) {
3086       /* ensure that the value is correctly represented in the low byte of the register */
3087       bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u8);
3088    } else if (instr->def.bit_size == 16) {
3089       /* ensure that the value is correctly represented in the low half of the register */
3090       bld.sopk(aco_opcode::s_movk_i32, Definition(dst), instr->value[0].u16);
3091    } else if (dst.size() == 1) {
3092       bld.copy(Definition(dst), Operand(instr->value[0].u32));
3093    } else {
3094       assert(dst.size() != 1);
3095       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3096       if (instr->def.bit_size == 64)
3097          for (unsigned i = 0; i < dst.size(); i++)
3098             vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
3099       else {
3100          for (unsigned i = 0; i < dst.size(); i++)
3101             vec->operands[i] = Operand{instr->value[i].u32};
3102       }
3103       vec->definitions[0] = Definition(dst);
3104       ctx->block->instructions.emplace_back(std::move(vec));
3105    }
3106 }
3107
/**
 * Widens a bit mask by replicating every bit \p multiplier times.
 *
 * If bit i of \p mask is set, bits [i * multiplier, (i + 1) * multiplier) of the
 * result are set. Bits which would end up at position 32 or above are dropped,
 * so the result is always the low 32 bits of the widened mask.
 *
 * \param mask       the mask to widen
 * \param multiplier the number of result bits per mask bit (0 yields an empty mask)
 * \return the widened mask, truncated to 32 bits
 */
uint32_t widen_mask(uint32_t mask, unsigned multiplier)
{
   /* Only bit 0 can contribute here, and shifting by 32 or more would be
    * undefined behaviour, so handle this case up front.
    */
   if (multiplier >= 32)
      return (mask & 1u) ? 0xffffffffu : 0u;

   /* 'multiplier' consecutive ones */
   const uint32_t group = (1u << multiplier) - 1u;

   uint32_t new_mask = 0;
   for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) {
      const unsigned shift = i * multiplier;
      /* this and all higher bits fall outside of the 32-bit result */
      if (shift >= 32)
         break;
      if (mask & (1u << i))
         new_mask |= group << shift;
   }
   return new_mask;
}
3116
/* Describes one load request which is handed to emit_load() and forwarded to
 * the load callback.
 *
 * emit_load() loads num_components * component_size bytes in total. All
 * members with a default value are optional.
 */
3117 struct LoadEmitInfo {
3118    Operand offset; /* base offset of the load (constant or temporary) */
3119    Temp dst; /* destination of the whole load */
3120    unsigned num_components; /* number of components to load */
3121    unsigned component_size; /* size of one component in bytes */
3122    Temp resource = Temp(0, s1); /* NOTE(review): presumably for the callbacks, not read by the visible part of emit_load() - confirm */
3123    unsigned component_stride = 0; /* if non-zero: one component per load, const_offset advances by this many bytes */
3124    unsigned const_offset = 0; /* constant offset in bytes, used in addition to 'offset' */
3125    unsigned align_mul = 0; /* emit_load() uses component_size if this is 0 */
3126    unsigned align_offset = 0; /* combined with const_offset modulo the effective align_mul */
3127
3128    bool glc = false; /* NOTE(review): not read by the visible part of emit_load() - confirm usage in the callbacks */
3129    unsigned swizzle_component_size = 0; /* if non-zero: upper limit for the bytes requested per load */
3130    barrier_interaction barrier = barrier_none; /* NOTE(review): not read by the visible part of emit_load() - confirm */
3131    bool can_reorder = true; /* NOTE(review): not read by the visible part of emit_load() - confirm */
3132    Temp soffset = Temp(0, s1); /* NOTE(review): not read by the visible part of emit_load() - confirm */
3133 };
3134
/* Function type which emit_load() calls to emit a single load.
 *
 * emit_load() passes the LoadEmitInfo it was given, the offset as a temporary
 * (after reducing the constant offset and, if needed, aligning it down to a
 * multiple of 4), the number of bytes it wants, the alignment it computed from
 * align_mul/align_offset, the remaining constant offset and a destination
 * hint (info->dst, or an empty Temp() when the result has to be byte-aligned
 * afterwards).
 *
 * The callback returns the loaded value. Returning info->dst tells emit_load()
 * that the destination was written directly; otherwise emit_load() uses the
 * size of the returned temporary to determine how many bytes were loaded.
 */
3135 using LoadCallback = Temp(*)(
3136    Builder& bld, const LoadEmitInfo* info, Temp offset, unsigned bytes_needed,
3137    unsigned align, unsigned const_offset, Temp dst_hint);
3138
3139 template <LoadCallback callback, bool byte_align_loads, bool supports_8bit_16bit_loads, unsigned max_const_offset_plus_one>
3140 void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
3141 {
3142    unsigned load_size = info->num_components * info->component_size;
3143    unsigned component_size = info->component_size;
3144
3145    unsigned num_vals = 0;
3146    Temp vals[info->dst.bytes()];
3147
3148    unsigned const_offset = info->const_offset;
3149
3150    unsigned align_mul = info->align_mul ? info->align_mul : component_size;
3151    unsigned align_offset = (info->align_offset + const_offset) % align_mul;
3152
3153    unsigned bytes_read = 0;
3154    while (bytes_read < load_size) {
3155       unsigned bytes_needed = load_size - bytes_read;
3156
3157       /* add buffer for unaligned loads */
3158       int byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
3159
3160       if (byte_align) {
3161          if ((bytes_needed > 2 ||
3162               (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
3163               !supports_8bit_16bit_loads) && byte_align_loads) {
3164             if (info->component_stride) {
3165                assert(supports_8bit_16bit_loads && "unimplemented");
3166                bytes_needed = 2;
3167                byte_align = 0;
3168             } else {
3169                bytes_needed += byte_align == -1 ? 4 - info->align_mul : byte_align;
3170                bytes_needed = align(bytes_needed, 4);
3171             }
3172          } else {
3173             byte_align = 0;
3174          }
3175       }
3176
3177       if (info->swizzle_component_size)
3178          bytes_needed = MIN2(bytes_needed, info->swizzle_component_size);
3179       if (info->component_stride)
3180          bytes_needed = MIN2(bytes_needed, info->component_size);
3181
3182       bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4);
3183
3184       /* reduce constant offset */
3185       Operand offset = info->offset;
3186       unsigned reduced_const_offset = const_offset;
3187       bool remove_const_offset_completely = need_to_align_offset;
3188       if (const_offset && (remove_const_offset_completely || const_offset >= max_const_offset_plus_one)) {
3189          unsigned to_add = const_offset;
3190          if (remove_const_offset_completely) {
3191             reduced_const_offset = 0;
3192          } else {
3193             to_add = const_offset / max_const_offset_plus_one * max_const_offset_plus_one;
3194             reduced_const_offset %= max_const_offset_plus_one;
3195          }
3196          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3197          if (offset.isConstant()) {
3198             offset = Operand(offset.constantValue() + to_add);
3199          } else if (offset_tmp.regClass() == s1) {
3200             offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
3201                               offset_tmp, Operand(to_add));
3202          } else if (offset_tmp.regClass() == v1) {
3203             offset = bld.vadd32(bld.def(v1), offset_tmp, Operand(to_add));
3204          } else {
3205             Temp lo = bld.tmp(offset_tmp.type(), 1);
3206             Temp hi = bld.tmp(offset_tmp.type(), 1);
3207             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3208
3209             if (offset_tmp.regClass() == s2) {
3210                Temp carry = bld.tmp(s1);
3211                lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo, Operand(to_add));
3212                hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry);
3213                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi);
3214             } else {
3215                Temp new_lo = bld.tmp(v1);
3216                Temp carry = bld.vadd32(Definition(new_lo), lo, Operand(to_add), true).def(1).getTemp();
3217                hi = bld.vadd32(bld.def(v1), hi, Operand(0u), false, carry);
3218                offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi);
3219             }
3220          }
3221       }
3222
3223       /* align offset down if needed */
3224       Operand aligned_offset = offset;
3225       if (need_to_align_offset) {
3226          Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
3227          if (offset.isConstant()) {
3228             aligned_offset = Operand(offset.constantValue() & 0xfffffffcu);
3229          } else if (offset_tmp.regClass() == s1) {
3230             aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfffffffcu), offset_tmp);
3231          } else if (offset_tmp.regClass() == s2) {
3232             aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand((uint64_t)0xfffffffffffffffcllu), offset_tmp);
3233          } else if (offset_tmp.regClass() == v1) {
3234             aligned_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), offset_tmp);
3235          } else if (offset_tmp.regClass() == v2) {
3236             Temp hi = bld.tmp(v1), lo = bld.tmp(v1);
3237             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp);
3238             lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xfffffffcu), lo);
3239             aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi);
3240          }
3241       }
3242       Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() :
3243                                 bld.copy(bld.def(s1), aligned_offset);
3244
3245       unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
3246       Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align,
3247                           reduced_const_offset, byte_align ? Temp() : info->dst);
3248
3249       /* the callback wrote directly to dst */
3250       if (val == info->dst) {
3251          assert(num_vals == 0);
3252          emit_split_vector(ctx, info->dst, info->num_components);
3253          return;
3254       }
3255
3256       /* shift result right if needed */
3257       if (info->component_size < 4 && byte_align_loads) {
3258          Operand align((uint32_t)byte_align);
3259          if (byte_align == -1) {
3260             if (offset.isConstant())
3261                align = Operand(offset.constantValue() % 4u);
3262             else if (offset.size() == 2)
3263                align = Operand(emit_extract_vector(ctx, offset.getTemp(), 0, RegClass(offset.getTemp().type(), 1)));
3264             else
3265                align = offset;
3266          }
3267
3268          assert(val.bytes() >= load_size && "unimplemented");
3269          if (val.type() == RegType::sgpr)
3270             byte_align_scalar(ctx, val, align, info->dst);
3271          else
3272             byte_align_vector(ctx, val, align, info->dst, component_size);
3273          return;
3274       }
3275
3276       /* add result to list and advance */
3277       if (info->component_stride) {
3278          assert(val.bytes() == info->component_size && "unimplemented");
3279          const_offset += info->component_stride;
3280          align_offset = (align_offset + info->component_stride) % align_mul;
3281       } else {
3282          const_offset += val.bytes();
3283          align_offset = (align_offset + val.bytes()) % align_mul;
3284       }
3285       bytes_read += val.bytes();
3286       vals[num_vals++] = val;
3287    }
3288
3289    /* create array of components */
3290    unsigned components_split = 0;
3291    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
3292    bool has_vgprs = false;
3293    for (unsigned i = 0; i < num_vals;) {
3294       Temp tmp[num_vals];
3295       unsigned num_tmps = 0;
3296       unsigned tmp_size = 0;
3297       RegType reg_type = RegType::sgpr;
3298       while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) {
3299          if (vals[i].type() == RegType::vgpr)
3300             reg_type = RegType::vgpr;
3301          tmp_size += vals[i].bytes();
3302          tmp[num_tmps++] = vals[i++];
3303       }
3304       if (num_tmps > 1) {
3305          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3306             aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
3307          for (unsigned i = 0; i < num_vals; i++)
3308             vec->operands[i] = Operand(tmp[i]);
3309          tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
3310          vec->definitions[0] = Definition(tmp[0]);
3311          bld.insert(std::move(vec));
3312       }
3313
3314       if (tmp[0].bytes() % component_size) {
3315          /* trim tmp[0] */
3316          assert(i == num_vals);
3317          RegClass new_rc = RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size);
3318          tmp[0] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand(0u));
3319       }
3320
3321       RegClass elem_rc = RegClass::get(reg_type, component_size);
3322
3323       unsigned start = components_split;
3324
3325       if (tmp_size == elem_rc.bytes()) {
3326          allocated_vec[components_split++] = tmp[0];
3327       } else {
3328          assert(tmp_size % elem_rc.bytes() == 0);
3329          aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(
3330             aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())};
3331          for (unsigned i = 0; i < split->definitions.size(); i++) {
3332             Temp component = bld.tmp(elem_rc);
3333             allocated_vec[components_split++] = component;
3334             split->definitions[i] = Definition(component);
3335          }
3336          split->operands[0] = Operand(tmp[0]);
3337          bld.insert(std::move(split));
3338       }
3339
3340       /* try to p_as_uniform early so we can create more optimizable code and
3341        * also update allocated_vec */
3342       for (unsigned j = start; j < components_split; j++) {
3343          if (allocated_vec[j].bytes() % 4 == 0 && info->dst.type() == RegType::sgpr)
3344             allocated_vec[j] = bld.as_uniform(allocated_vec[j]);
3345          has_vgprs |= allocated_vec[j].type() == RegType::vgpr;
3346       }
3347    }
3348
3349    /* concatenate components and p_as_uniform() result if needed */
3350    if (info->dst.type() == RegType::vgpr || !has_vgprs)
3351       ctx->allocated_vec.emplace(info->dst.id(), allocated_vec);
3352
3353    int padding_bytes = MAX2((int)info->dst.bytes() - int(allocated_vec[0].bytes() * info->num_components), 0);
3354
3355    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
3356       aco_opcode::p_create_vector, Format::PSEUDO, info->num_components + !!padding_bytes, 1)};
3357    for (unsigned i = 0; i < info->num_components; i++)
3358       vec->operands[i] = Operand(allocated_vec[i]);
3359    if (padding_bytes)
3360       vec->operands[info->num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes));
3361    if (info->dst.type() == RegType::sgpr && has_vgprs) {
3362       Temp tmp = bld.tmp(RegType::vgpr, info->dst.size());
3363       vec->definitions[0] = Definition(tmp);
3364       bld.insert(std::move(vec));
3365       bld.pseudo(aco_opcode::p_as_uniform, Definition(info->dst), tmp);
3366    } else {
3367       vec->definitions[0] = Definition(info->dst);
3368       bld.insert(std::move(vec));
3369    }
3370 }
3371
3372 Operand load_lds_size_m0(Builder& bld)
3373 {
3374    /* TODO: m0 does not need to be initialized on GFX9+ */
3375    return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
3376 }
3377
3378 Temp lds_load_callback(Builder& bld, const LoadEmitInfo *info,
3379                        Temp offset, unsigned bytes_needed,
3380                        unsigned align, unsigned const_offset,
3381                        Temp dst_hint)
3382 {
3383    offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
3384
3385    Operand m = load_lds_size_m0(bld);
3386
3387    bool large_ds_read = bld.program->chip_class >= GFX7;
3388    bool usable_read2 = bld.program->chip_class >= GFX7;
3389
3390    bool read2 = false;
3391    unsigned size = 0;
3392    aco_opcode op;
3393    //TODO: use ds_read_u8_d16_hi/ds_read_u16_d16_hi if beneficial
3394    if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
3395       size = 16;
3396       op = aco_opcode::ds_read_b128;
3397    } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
3398       size = 16;
3399       read2 = true;
3400       op = aco_opcode::ds_read2_b64;
3401    } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
3402       size = 12;
3403       op = aco_opcode::ds_read_b96;
3404    } else if (bytes_needed >= 8 && align % 8 == 0) {
3405       size = 8;
3406       op = aco_opcode::ds_read_b64;
3407    } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0) {
3408       size = 8;
3409       read2 = true;
3410       op = aco_opcode::ds_read2_b32;
3411    } else if (bytes_needed >= 4 && align % 4 == 0) {
3412       size = 4;
3413       op = aco_opcode::ds_read_b32;
3414    } else if (bytes_needed >= 2 && align % 2 == 0) {
3415       size = 2;
3416       op = aco_opcode::ds_read_u16;
3417    } else {
3418       size = 1;
3419       op = aco_opcode::ds_read_u8;
3420    }
3421
3422    unsigned max_offset_plus_one = read2 ? 254 * (size / 2u) + 1 : 65536;
3423    if (const_offset >= max_offset_plus_one) {
3424       offset = bld.vadd32(bld.def(v1), offset, Operand(const_offset / max_offset_plus_one));
3425       const_offset %= max_offset_plus_one;
3426    }
3427
3428    if (read2)
3429       const_offset /= (size / 2u);
3430
3431    RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4));
3432    Temp val = rc == info->dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
3433    if (read2)
3434       bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
3435    else
3436       bld.ds(op, Definition(val), offset, m, const_offset);
3437
3438    if (size < 4)
3439       val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u));
3440
3441    return val;
3442 }
3443
/* emit_load instantiated with lds_load_callback as the per-load callback */
static auto emit_lds_load = emit_load<lds_load_callback, false, true, UINT32_MAX>;
3445
3446 Temp smem_load_callback(Builder& bld, const LoadEmitInfo *info,
3447                         Temp offset, unsigned bytes_needed,
3448                         unsigned align, unsigned const_offset,
3449                         Temp dst_hint)
3450 {
3451    unsigned size = 0;
3452    aco_opcode op;
3453    if (bytes_needed <= 4) {
3454       size = 1;
3455       op = info->resource.id() ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
3456    } else if (bytes_needed <= 8) {
3457       size = 2;
3458       op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
3459    } else if (bytes_needed <= 16) {
3460       size = 4;
3461       op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
3462    } else if (bytes_needed <= 32) {
3463       size = 8;
3464       op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
3465    } else {
3466       size = 16;
3467       op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
3468    }
3469    aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
3470    if (info->resource.id()) {
3471       load->operands[0] = Operand(info->resource);
3472       load->operands[1] = Operand(offset);
3473    } else {
3474       load->operands[0] = Operand(offset);
3475       load->operands[1] = Operand(0u);
3476    }
3477    RegClass rc(RegType::sgpr, size);
3478    Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
3479    load->definitions[0] = Definition(val);
3480    load->glc = info->glc;
3481    load->dlc = info->glc && bld.program->chip_class >= GFX10;
3482    load->barrier = info->barrier;
3483    load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
3484    bld.insert(std::move(load));
3485    return val;
3486 }
3487
/* emit_load instantiated with smem_load_callback as the per-load callback */
static auto emit_smem_load = emit_load<smem_load_callback, true, false, 1024>;
3489
3490 Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
3491                          Temp offset, unsigned bytes_needed,
3492                          unsigned align_, unsigned const_offset,
3493                          Temp dst_hint)
3494 {
3495    Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3496    Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
3497
3498    if (info->soffset.id()) {
3499       if (soffset.isTemp())
3500          vaddr = bld.copy(bld.def(v1), soffset);
3501       soffset = Operand(info->soffset);
3502    }
3503
3504    unsigned bytes_size = 0;
3505    aco_opcode op;
3506    if (bytes_needed == 1) {
3507       bytes_size = 1;
3508       op = aco_opcode::buffer_load_ubyte;
3509    } else if (bytes_needed == 2) {
3510       bytes_size = 2;
3511       op = aco_opcode::buffer_load_ushort;
3512    } else if (bytes_needed <= 4) {
3513       bytes_size = 4;
3514       op = aco_opcode::buffer_load_dword;
3515    } else if (bytes_needed <= 8) {
3516       bytes_size = 8;
3517       op = aco_opcode::buffer_load_dwordx2;
3518    } else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) {
3519       bytes_size = 12;
3520       op = aco_opcode::buffer_load_dwordx3;
3521    } else {
3522       bytes_size = 16;
3523       op = aco_opcode::buffer_load_dwordx4;
3524    }
3525    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3526    mubuf->operands[0] = Operand(info->resource);
3527    mubuf->operands[1] = vaddr;
3528    mubuf->operands[2] = soffset;
3529    mubuf->offen = (offset.type() == RegType::vgpr);
3530    mubuf->glc = info->glc;
3531    mubuf->dlc = info->glc && bld.program->chip_class >= GFX10;
3532    mubuf->barrier = info->barrier;
3533    mubuf->can_reorder = info->can_reorder;
3534    mubuf->offset = const_offset;
3535    RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
3536    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
3537    mubuf->definitions[0] = Definition(val);
3538    bld.insert(std::move(mubuf));
3539
3540    return val;
3541 }
3542
/* emit_load instantiated with mubuf_load_callback as the per-load callback */
static auto emit_mubuf_load = emit_load<mubuf_load_callback, true, true, 4096>;
3544
3545 Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
3546 {
3547    uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3548                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3549
3550    if (addr.type() == RegType::vgpr)
3551       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), Operand(-1u), Operand(rsrc_conf));
3552    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), Operand(rsrc_conf));
3553 }
3554
3555 Temp global_load_callback(Builder& bld, const LoadEmitInfo *info,
3556                           Temp offset, unsigned bytes_needed,
3557                           unsigned align_, unsigned const_offset,
3558                           Temp dst_hint)
3559 {
3560    unsigned bytes_size = 0;
3561    bool mubuf = bld.program->chip_class == GFX6;
3562    bool global = bld.program->chip_class >= GFX9;
3563    aco_opcode op;
3564    if (bytes_needed == 1) {
3565       bytes_size = 1;
3566       op = mubuf ? aco_opcode::buffer_load_ubyte : global ? aco_opcode::global_load_ubyte : aco_opcode::flat_load_ubyte;
3567    } else if (bytes_needed == 2) {
3568       bytes_size = 2;
3569       op = mubuf ? aco_opcode::buffer_load_ushort : global ? aco_opcode::global_load_ushort : aco_opcode::flat_load_ushort;
3570    } else if (bytes_needed <= 4) {
3571       bytes_size = 4;
3572       op = mubuf ? aco_opcode::buffer_load_dword : global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
3573    } else if (bytes_needed <= 8) {
3574       bytes_size = 8;
3575       op = mubuf ? aco_opcode::buffer_load_dwordx2 : global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
3576    } else if (bytes_needed <= 12 && !mubuf) {
3577       bytes_size = 12;
3578       op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
3579    } else {
3580       bytes_size = 16;
3581       op = mubuf ? aco_opcode::buffer_load_dwordx4 : global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
3582    }
3583    RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
3584    Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
3585    if (mubuf) {
3586       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3587       mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset));
3588       mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3589       mubuf->operands[2] = Operand(0u);
3590       mubuf->glc = info->glc;
3591       mubuf->dlc = false;
3592       mubuf->offset = 0;
3593       mubuf->addr64 = offset.type() == RegType::vgpr;
3594       mubuf->disable_wqm = false;
3595       mubuf->barrier = info->barrier;
3596       mubuf->definitions[0] = Definition(val);
3597       bld.insert(std::move(mubuf));
3598    } else {
3599       offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset;
3600
3601       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
3602       flat->operands[0] = Operand(offset);
3603       flat->operands[1] = Operand(s1);
3604       flat->glc = info->glc;
3605       flat->dlc = info->glc && bld.program->chip_class >= GFX10;
3606       flat->barrier = info->barrier;
3607       flat->offset = 0u;
3608       flat->definitions[0] = Definition(val);
3609       bld.insert(std::move(flat));
3610    }
3611
3612    return val;
3613 }
3614
/* emit_load instantiated with global_load_callback as the per-load callback */
static auto emit_global_load = emit_load<global_load_callback, true, true, 1>;
3616
3617 Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
3618               Temp address, unsigned base_offset, unsigned align)
3619 {
3620    assert(util_is_power_of_two_nonzero(align));
3621
3622    Builder bld(ctx->program, ctx->block);
3623
3624    unsigned num_components = dst.bytes() / elem_size_bytes;
3625    LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
3626    info.align_mul = align;
3627    info.align_offset = 0;
3628    info.barrier = barrier_shared;
3629    info.can_reorder = false;
3630    info.const_offset = base_offset;
3631    emit_lds_load(ctx, bld, &info);
3632
3633    return dst;
3634 }
3635
3636 void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp *dst, unsigned *offsets, Temp src)
3637 {
3638    if (!count)
3639       return;
3640
3641    Builder bld(ctx->program, ctx->block);
3642
3643    ASSERTED bool is_subdword = false;
3644    for (unsigned i = 0; i < count; i++)
3645       is_subdword |= offsets[i] % 4;
3646    is_subdword |= (src.bytes() - offsets[count - 1]) % 4;
3647    assert(!is_subdword || dst_type == RegType::vgpr);
3648
3649    /* count == 1 fast path */
3650    if (count == 1) {
3651       if (dst_type == RegType::sgpr)
3652          dst[0] = bld.as_uniform(src);
3653       else
3654          dst[0] = as_vgpr(ctx, src);
3655       return;
3656    }
3657
3658    for (unsigned i = 0; i < count - 1; i++)
3659       dst[i] = bld.tmp(RegClass::get(dst_type, offsets[i + 1] - offsets[i]));
3660    dst[count - 1] = bld.tmp(RegClass::get(dst_type, src.bytes() - offsets[count - 1]));
3661
3662    if (is_subdword && src.type() == RegType::sgpr) {
3663       src = as_vgpr(ctx, src);
3664    } else {
3665       /* use allocated_vec if possible */
3666       auto it = ctx->allocated_vec.find(src.id());
3667       if (it != ctx->allocated_vec.end()) {
3668          unsigned total_size = 0;
3669          for (unsigned i = 0; it->second[i].bytes() && (i < NIR_MAX_VEC_COMPONENTS); i++)
3670             total_size += it->second[i].bytes();
3671          if (total_size != src.bytes())
3672             goto split;
3673
3674          unsigned elem_size = it->second[0].bytes();
3675
3676          for (unsigned i = 0; i < count; i++) {
3677             if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
3678                goto split;
3679          }
3680
3681          for (unsigned i = 0; i < count; i++) {
3682             unsigned start_idx = offsets[i] / elem_size;
3683             unsigned op_count = dst[i].bytes() / elem_size;
3684             if (op_count == 1) {
3685                if (dst_type == RegType::sgpr)
3686                   dst[i] = bld.as_uniform(it->second[start_idx]);
3687                else
3688                   dst[i] = as_vgpr(ctx, it->second[start_idx]);
3689                continue;
3690             }
3691
3692             aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)};
3693             for (unsigned j = 0; j < op_count; j++) {
3694                Temp tmp = it->second[start_idx + j];
3695                if (dst_type == RegType::sgpr)
3696                   tmp = bld.as_uniform(tmp);
3697                vec->operands[j] = Operand(tmp);
3698             }
3699             vec->definitions[0] = Definition(dst[i]);
3700             bld.insert(std::move(vec));
3701          }
3702          return;
3703       }
3704    }
3705
3706    if (dst_type == RegType::sgpr)
3707       src = bld.as_uniform(src);
3708
3709    split:
3710    /* just split it */
3711    aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
3712    split->operands[0] = Operand(src);
3713    for (unsigned i = 0; i < count; i++)
3714       split->definitions[i] = Definition(dst[i]);
3715    bld.insert(std::move(split));
3716 }
3717
3718 bool scan_write_mask(uint32_t mask, uint32_t todo_mask,
3719                      int *start, int *count)
3720 {
3721    unsigned start_elem = ffs(todo_mask) - 1;
3722    bool skip = !(mask & (1 << start_elem));
3723    if (skip)
3724       mask = ~mask & todo_mask;
3725
3726    mask &= todo_mask;
3727
3728    u_bit_scan_consecutive_range(&mask, start, count);
3729
3730    return !skip;
3731 }
3732
3733 void advance_write_mask(uint32_t *todo_mask, int start, int count)
3734 {
3735    *todo_mask &= ~u_bit_consecutive(0, count) << start;
3736 }
3737
3738 void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
3739                Temp address, unsigned base_offset, unsigned align)
3740 {
3741    assert(util_is_power_of_two_nonzero(align));
3742    assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
3743
3744    Builder bld(ctx->program, ctx->block);
3745    bool large_ds_write = ctx->options->chip_class >= GFX7;
3746    bool usable_write2 = ctx->options->chip_class >= GFX7;
3747
3748    unsigned write_count = 0;
3749    Temp write_datas[32];
3750    unsigned offsets[32];
3751    aco_opcode opcodes[32];
3752
3753    wrmask = widen_mask(wrmask, elem_size_bytes);
3754
3755    uint32_t todo = u_bit_consecutive(0, data.bytes());
3756    while (todo) {
3757       int offset, bytes;
3758       if (!scan_write_mask(wrmask, todo, &offset, &bytes)) {
3759          offsets[write_count] = offset;
3760          opcodes[write_count] = aco_opcode::num_opcodes;
3761          write_count++;
3762          advance_write_mask(&todo, offset, bytes);
3763          continue;
3764       }
3765
3766       bool aligned2 = offset % 2 == 0 && align % 2 == 0;
3767       bool aligned4 = offset % 4 == 0 && align % 4 == 0;
3768       bool aligned8 = offset % 8 == 0 && align % 8 == 0;
3769       bool aligned16 = offset % 16 == 0 && align % 16 == 0;
3770
3771       //TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
3772       aco_opcode op = aco_opcode::num_opcodes;
3773       if (bytes >= 16 && aligned16 && large_ds_write) {
3774          op = aco_opcode::ds_write_b128;
3775          bytes = 16;
3776       } else if (bytes >= 12 && aligned16 && large_ds_write) {
3777          op = aco_opcode::ds_write_b96;
3778          bytes = 12;
3779       } else if (bytes >= 8 && aligned8) {
3780          op = aco_opcode::ds_write_b64;
3781          bytes = 8;
3782       } else if (bytes >= 4 && aligned4) {
3783          op = aco_opcode::ds_write_b32;
3784          bytes = 4;
3785       } else if (bytes >= 2 && aligned2) {
3786          op = aco_opcode::ds_write_b16;
3787          bytes = 2;
3788       } else if (bytes >= 1) {
3789          op = aco_opcode::ds_write_b8;
3790          bytes = 1;
3791       } else {
3792          assert(false);
3793       }
3794
3795       offsets[write_count] = offset;
3796       opcodes[write_count] = op;
3797       write_count++;
3798       advance_write_mask(&todo, offset, bytes);
3799    }
3800
3801    Operand m = load_lds_size_m0(bld);
3802
3803    split_store_data(ctx, RegType::vgpr, write_count, write_datas, offsets, data);
3804
3805    for (unsigned i = 0; i < write_count; i++) {
3806       aco_opcode op = opcodes[i];
3807       if (op == aco_opcode::num_opcodes)
3808          continue;
3809
3810       Temp data = write_datas[i];
3811
3812       unsigned second = write_count;
3813       if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
3814          for (second = i + 1; second < write_count; second++) {
3815             if (opcodes[second] == op && (offsets[second] - offsets[i]) % data.bytes() == 0) {
3816                op = data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
3817                opcodes[second] = aco_opcode::num_opcodes;
3818                break;
3819             }
3820          }
3821       }
3822
3823       bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
3824       unsigned write2_off = (offsets[second] - offsets[i]) / data.bytes();
3825
3826       unsigned inline_offset = base_offset + offsets[i];
3827       unsigned max_offset = write2 ? (255 - write2_off) * data.bytes() : 65535;
3828       Temp address_offset = address;
3829       if (inline_offset > max_offset) {
3830          address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
3831          inline_offset = offsets[i];
3832       }
3833       assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */
3834
3835       if (write2) {
3836          Temp second_data = write_datas[second];
3837          inline_offset /= data.bytes();
3838          bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
3839       } else {
3840          bld.ds(op, address_offset, data, m, inline_offset);
3841       }
3842    }
3843 }
3844
3845 unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
3846 {
3847    unsigned align = 16;
3848    if (const_offset)
3849       align = std::min(align, 1u << (ffs(const_offset) - 1));
3850
3851    return align;
3852 }
3853
3854
3855 aco_opcode get_buffer_store_op(bool smem, unsigned bytes)
3856 {
3857    switch (bytes) {
3858    case 1:
3859       assert(!smem);
3860       return aco_opcode::buffer_store_byte;
3861    case 2:
3862       assert(!smem);
3863       return aco_opcode::buffer_store_short;
3864    case 4:
3865       return smem ? aco_opcode::s_buffer_store_dword : aco_opcode::buffer_store_dword;
3866    case 8:
3867       return smem ? aco_opcode::s_buffer_store_dwordx2 : aco_opcode::buffer_store_dwordx2;
3868    case 12:
3869       assert(!smem);
3870       return aco_opcode::buffer_store_dwordx3;
3871    case 16:
3872       return smem ? aco_opcode::s_buffer_store_dwordx4 : aco_opcode::buffer_store_dwordx4;
3873    }
3874    unreachable("Unexpected store size");
3875    return aco_opcode::num_opcodes;
3876 }
3877
3878 void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem, RegType dst_type,
3879                         Temp data, unsigned writemask, int swizzle_element_size,
3880                         unsigned *write_count, Temp *write_datas, unsigned *offsets)
3881 {
3882    unsigned write_count_with_skips = 0;
3883    bool skips[16];
3884
3885    /* determine how to split the data */
3886    unsigned todo = u_bit_consecutive(0, data.bytes());
3887    while (todo) {
3888       int offset, bytes;
3889       skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &bytes);
3890       offsets[write_count_with_skips] = offset;
3891       if (skips[write_count_with_skips]) {
3892          advance_write_mask(&todo, offset, bytes);
3893          write_count_with_skips++;
3894          continue;
3895       }
3896
3897       /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
3898        * larger than swizzle_element_size */
3899       bytes = MIN2(bytes, swizzle_element_size);
3900       if (bytes % 4)
3901          bytes = bytes > 4 ? bytes & ~0x3 : MIN2(bytes, 2);
3902
3903       /* SMEM and GFX6 VMEM can't emit 12-byte stores */
3904       if ((ctx->program->chip_class == GFX6 || smem) && bytes == 12)
3905          bytes = 8;
3906
3907       /* dword or larger stores have to be dword-aligned */
3908       unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
3909       unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
3910       bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
3911       if (!dword_aligned)
3912          bytes = MIN2(bytes, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
3913
3914       advance_write_mask(&todo, offset, bytes);
3915       write_count_with_skips++;
3916    }
3917
3918    /* actually split data */
3919    split_store_data(ctx, dst_type, write_count_with_skips, write_datas, offsets, data);
3920
3921    /* remove skips */
3922    for (unsigned i = 0; i < write_count_with_skips; i++) {
3923       if (skips[i])
3924          continue;
3925       write_datas[*write_count] = write_datas[i];
3926       offsets[*write_count] = offsets[i];
3927       (*write_count)++;
3928    }
3929 }
3930
3931 Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes,
3932                            unsigned split_cnt = 0u, Temp dst = Temp())
3933 {
3934    Builder bld(ctx->program, ctx->block);
3935    unsigned dword_size = elem_size_bytes / 4;
3936
3937    if (!dst.id())
3938       dst = bld.tmp(RegClass(reg_type, cnt * dword_size));
3939
3940    std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
3941    aco_ptr<Pseudo_instruction> instr {create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)};
3942    instr->definitions[0] = Definition(dst);
3943
3944    for (unsigned i = 0; i < cnt; ++i) {
3945       if (arr[i].id()) {
3946          assert(arr[i].size() == dword_size);
3947          allocated_vec[i] = arr[i];
3948          instr->operands[i] = Operand(arr[i]);
3949       } else {
3950          Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)), Operand(0u, dword_size == 2));
3951          allocated_vec[i] = zero;
3952          instr->operands[i] = Operand(zero);
3953       }
3954    }
3955
3956    bld.insert(std::move(instr));
3957
3958    if (split_cnt)
3959       emit_split_vector(ctx, dst, split_cnt);
3960    else
3961       ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */
3962
3963    return dst;
3964 }
3965
3966 inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, unsigned const_offset)
3967 {
3968    if (const_offset >= 4096) {
3969       unsigned excess_const_offset = const_offset / 4096u * 4096u;
3970       const_offset %= 4096u;
3971
3972       if (!voffset.id())
3973          voffset = bld.copy(bld.def(v1), Operand(excess_const_offset));
3974       else if (unlikely(voffset.regClass() == s1))
3975          voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), Operand(excess_const_offset), Operand(voffset));
3976       else if (likely(voffset.regClass() == v1))
3977          voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand(excess_const_offset));
3978       else
3979          unreachable("Unsupported register class of voffset");
3980    }
3981
3982    return const_offset;
3983 }
3984
3985 void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
3986                              unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false)
3987 {
3988    assert(vdata.id());
3989    assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
3990    assert(vdata.size() >= 1 && vdata.size() <= 4);
3991
3992    Builder bld(ctx->program, ctx->block);
3993    aco_opcode op = get_buffer_store_op(false, vdata.bytes());
3994    const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
3995
3996    Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
3997    Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
3998    Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
3999                                  /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false,
4000                                  /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc);
4001
4002    static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
4003 }
4004
4005 void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
4006                                    unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
4007                                    bool allow_combining = true, bool reorder = true, bool slc = false)
4008 {
4009    Builder bld(ctx->program, ctx->block);
4010    assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4011    assert(write_mask);
4012    write_mask = widen_mask(write_mask, elem_size_bytes);
4013
4014    unsigned write_count = 0;
4015    Temp write_datas[32];
4016    unsigned offsets[32];
4017    split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask,
4018                       allow_combining ? 16 : 4, &write_count, write_datas, offsets);
4019
4020    for (unsigned i = 0; i < write_count; i++) {
4021       unsigned const_offset = offsets[i] + base_const_offset;
4022       emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc);
4023    }
4024 }
4025
4026 void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
4027                      unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
4028                      unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
4029 {
4030    assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
4031    assert((num_components * elem_size_bytes) == dst.bytes());
4032    assert(!!stride != allow_combining);
4033
4034    Builder bld(ctx->program, ctx->block);
4035
4036    LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
4037    info.component_stride = allow_combining ? 0 : stride;
4038    info.glc = true;
4039    info.swizzle_component_size = allow_combining ? 0 : 4;
4040    info.align_mul = MIN2(elem_size_bytes, 4);
4041    info.align_offset = 0;
4042    info.soffset = soffset;
4043    info.const_offset = base_const_offset;
4044    emit_mubuf_load(ctx, bld, &info);
4045 }
4046
4047 std::pair<Temp, unsigned> offset_add_from_nir(isel_context *ctx, const std::pair<Temp, unsigned> &base_offset, nir_src *off_src, unsigned stride = 1u)
4048 {
4049    Builder bld(ctx->program, ctx->block);
4050    Temp offset = base_offset.first;
4051    unsigned const_offset = base_offset.second;
4052
4053    if (!nir_src_is_const(*off_src)) {
4054       Temp indirect_offset_arg = get_ssa_temp(ctx, off_src->ssa);
4055       Temp with_stride;
4056
4057       /* Calculate indirect offset with stride */
4058       if (likely(indirect_offset_arg.regClass() == v1))
4059          with_stride = bld.v_mul24_imm(bld.def(v1), indirect_offset_arg, stride);
4060       else if (indirect_offset_arg.regClass() == s1)
4061          with_stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), indirect_offset_arg);
4062       else
4063          unreachable("Unsupported register class of indirect offset");
4064
4065       /* Add to the supplied base offset */
4066       if (offset.id() == 0)
4067          offset = with_stride;
4068       else if (unlikely(offset.regClass() == s1 && with_stride.regClass() == s1))
4069          offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), with_stride, offset);
4070       else if (offset.size() == 1 && with_stride.size() == 1)
4071          offset = bld.vadd32(bld.def(v1), with_stride, offset);
4072       else
4073          unreachable("Unsupported register class of indirect offset");
4074    } else {
4075       unsigned const_offset_arg = nir_src_as_uint(*off_src);
4076       const_offset += const_offset_arg * stride;
4077    }
4078
4079    return std::make_pair(offset, const_offset);
4080 }
4081
4082 std::pair<Temp, unsigned> offset_add(isel_context *ctx, const std::pair<Temp, unsigned> &off1, const std::pair<Temp, unsigned> &off2)
4083 {
4084    Builder bld(ctx->program, ctx->block);
4085    Temp offset;
4086
4087    if (off1.first.id() && off2.first.id()) {
4088       if (unlikely(off1.first.regClass() == s1 && off2.first.regClass() == s1))
4089          offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), off1.first, off2.first);
4090       else if (off1.first.size() == 1 && off2.first.size() == 1)
4091          offset = bld.vadd32(bld.def(v1), off1.first, off2.first);
4092       else
4093          unreachable("Unsupported register class of indirect offset");
4094    } else {
4095       offset = off1.first.id() ? off1.first : off2.first;
4096    }
4097
4098    return std::make_pair(offset, off1.second + off2.second);
4099 }
4100
4101 std::pair<Temp, unsigned> offset_mul(isel_context *ctx, const std::pair<Temp, unsigned> &offs, unsigned multiplier)
4102 {
4103    Builder bld(ctx->program, ctx->block);
4104    unsigned const_offset = offs.second * multiplier;
4105
4106    if (!offs.first.id())
4107       return std::make_pair(offs.first, const_offset);
4108
4109    Temp offset = unlikely(offs.first.regClass() == s1)
4110                  ? bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(multiplier), offs.first)
4111                  : bld.v_mul24_imm(bld.def(v1), offs.first, multiplier);
4112
4113    return std::make_pair(offset, const_offset);
4114 }
4115
4116 std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride, unsigned component_stride)
4117 {
4118    Builder bld(ctx->program, ctx->block);
4119
4120    /* base is the driver_location, which is already multiplied by 4, so is in dwords */
4121    unsigned const_offset = nir_intrinsic_base(instr) * base_stride;
4122    /* component is in bytes */
4123    const_offset += nir_intrinsic_component(instr) * component_stride;
4124
4125    /* offset should be interpreted in relation to the base, so the instruction effectively reads/writes another input/output when it has an offset */
4126    nir_src *off_src = nir_get_io_offset_src(instr);
4127    return offset_add_from_nir(ctx, std::make_pair(Temp(), const_offset), off_src, 4u * base_stride);
4128 }
4129
4130 std::pair<Temp, unsigned> get_intrinsic_io_basic_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned stride = 1u)
4131 {
4132    return get_intrinsic_io_basic_offset(ctx, instr, stride, stride);
4133 }
4134
4135 Temp get_tess_rel_patch_id(isel_context *ctx)
4136 {
4137    Builder bld(ctx->program, ctx->block);
4138
4139    switch (ctx->shader->info.stage) {
4140    case MESA_SHADER_TESS_CTRL:
4141       return bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffu),
4142                       get_arg(ctx, ctx->args->ac.tcs_rel_ids));
4143    case MESA_SHADER_TESS_EVAL:
4144       return get_arg(ctx, ctx->args->tes_rel_patch_id);
4145    default:
4146       unreachable("Unsupported stage in get_tess_rel_patch_id");
4147    }
4148 }
4149
4150 std::pair<Temp, unsigned> get_tcs_per_vertex_input_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr)
4151 {
4152    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4153    Builder bld(ctx->program, ctx->block);
4154
4155    uint32_t tcs_in_patch_stride = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 4;
4156    uint32_t tcs_in_vertex_stride = ctx->tcs_num_inputs * 4;
4157
4158    std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr);
4159
4160    nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4161    offs = offset_add_from_nir(ctx, offs, vertex_index_src, tcs_in_vertex_stride);
4162
4163    Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4164    Temp tcs_in_current_patch_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, tcs_in_patch_stride);
4165    offs = offset_add(ctx, offs, std::make_pair(tcs_in_current_patch_offset, 0));
4166
4167    return offset_mul(ctx, offs, 4u);
4168 }
4169
4170 std::pair<Temp, unsigned> get_tcs_output_lds_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, bool per_vertex = false)
4171 {
4172    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4173    Builder bld(ctx->program, ctx->block);
4174
4175    uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16;
4176    uint32_t output_vertex_size = ctx->tcs_num_outputs * 16;
4177    uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
4178    uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16;
4179
4180    std::pair<Temp, unsigned> offs = instr
4181                                     ? get_intrinsic_io_basic_offset(ctx, instr, 4u)
4182                                     : std::make_pair(Temp(), 0u);
4183
4184    Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4185    Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, output_patch_stride);
4186
4187    if (per_vertex) {
4188       assert(instr);
4189
4190       nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4191       offs = offset_add_from_nir(ctx, offs, vertex_index_src, output_vertex_size);
4192
4193       uint32_t output_patch0_offset = (input_patch_size * ctx->tcs_num_patches);
4194       offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_offset));
4195    } else {
4196       uint32_t output_patch0_patch_data_offset = (input_patch_size * ctx->tcs_num_patches + pervertex_output_patch_size);
4197       offs = offset_add(ctx, offs, std::make_pair(patch_off, output_patch0_patch_data_offset));
4198    }
4199
4200    return offs;
4201 }
4202
4203 std::pair<Temp, unsigned> get_tcs_per_vertex_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr)
4204 {
4205    Builder bld(ctx->program, ctx->block);
4206
4207    unsigned vertices_per_patch = ctx->shader->info.tess.tcs_vertices_out;
4208    unsigned attr_stride = vertices_per_patch * ctx->tcs_num_patches;
4209
4210    std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u);
4211
4212    Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4213    Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, vertices_per_patch * 16u);
4214    offs = offset_add(ctx, offs, std::make_pair(patch_off, 0u));
4215
4216    nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4217    offs = offset_add_from_nir(ctx, offs, vertex_index_src, 16u);
4218
4219    return offs;
4220 }
4221
4222 std::pair<Temp, unsigned> get_tcs_per_patch_output_vmem_offset(isel_context *ctx, nir_intrinsic_instr *instr = nullptr, unsigned const_base_offset = 0u)
4223 {
4224    Builder bld(ctx->program, ctx->block);
4225
4226    unsigned output_vertex_size = ctx->tcs_num_outputs * 16;
4227    unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
4228    unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches;
4229    unsigned attr_stride = ctx->tcs_num_patches;
4230
4231    std::pair<Temp, unsigned> offs = instr
4232                                     ? get_intrinsic_io_basic_offset(ctx, instr, attr_stride * 4u, 4u)
4233                                     : std::make_pair(Temp(), 0u);
4234
4235    if (const_base_offset)
4236       offs.second += const_base_offset * attr_stride;
4237
4238    Temp rel_patch_id = get_tess_rel_patch_id(ctx);
4239    Temp patch_off = bld.v_mul24_imm(bld.def(v1), rel_patch_id, 16u);
4240    offs = offset_add(ctx, offs, std::make_pair(patch_off, per_patch_data_offset));
4241
4242    return offs;
4243 }
4244
4245 bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect)
4246 {
4247    assert(per_vertex || ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4248
4249    if (mask == 0)
4250       return false;
4251
4252    unsigned drv_loc = nir_intrinsic_base(instr);
4253    nir_src *off_src = nir_get_io_offset_src(instr);
4254
4255    if (!nir_src_is_const(*off_src)) {
4256       *indirect = true;
4257       return false;
4258    }
4259
4260    *indirect = false;
4261    uint64_t slot = per_vertex
4262                    ? ctx->output_drv_loc_to_var_slot[ctx->shader->info.stage][drv_loc / 4]
4263                    : (ctx->output_tcs_patch_drv_loc_to_var_slot[drv_loc / 4] - VARYING_SLOT_PATCH0);
4264    return (((uint64_t) 1) << slot) & mask;
4265 }
4266
4267 bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
4268 {
4269    unsigned write_mask = nir_intrinsic_write_mask(instr);
4270    unsigned component = nir_intrinsic_component(instr);
4271    unsigned idx = nir_intrinsic_base(instr) + component;
4272
4273    nir_instr *off_instr = instr->src[1].ssa->parent_instr;
4274    if (off_instr->type != nir_instr_type_load_const)
4275       return false;
4276
4277    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4278    idx += nir_src_as_uint(instr->src[1]) * 4u;
4279
4280    if (instr->src[0].ssa->bit_size == 64)
4281       write_mask = widen_mask(write_mask, 2);
4282
4283    RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
4284
4285    for (unsigned i = 0; i < 8; ++i) {
4286       if (write_mask & (1 << i)) {
4287          ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
4288          ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
4289       }
4290       idx++;
4291    }
4292
4293    return true;
4294 }
4295
4296 bool load_input_from_temps(isel_context *ctx, nir_intrinsic_instr *instr, Temp dst)
4297 {
4298    /* Only TCS per-vertex inputs are supported by this function.
4299     * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations is the same.
4300     */
4301    if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq)
4302       return false;
4303
4304    nir_src *off_src = nir_get_io_offset_src(instr);
4305    nir_src *vertex_index_src = nir_get_io_vertex_index_src(instr);
4306    nir_instr *vertex_index_instr = vertex_index_src->ssa->parent_instr;
4307    bool can_use_temps = nir_src_is_const(*off_src) &&
4308                         vertex_index_instr->type == nir_instr_type_intrinsic &&
4309                         nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id;
4310
4311    if (!can_use_temps)
4312       return false;
4313
4314    unsigned idx = nir_intrinsic_base(instr) + nir_intrinsic_component(instr) + 4 * nir_src_as_uint(*off_src);
4315    Temp *src = &ctx->inputs.temps[idx];
4316    create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst);
4317
4318    return true;
4319 }
4320
4321 void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
4322 {
4323    Builder bld(ctx->program, ctx->block);
4324
4325    if (ctx->tcs_in_out_eq && store_output_to_temps(ctx, instr)) {
4326       /* When the TCS only reads this output directly and for the same vertices as its invocation id, it is unnecessary to store the VS output to LDS. */
4327       bool indirect_write;
4328       bool temp_only_input = tcs_driver_location_matches_api_mask(ctx, instr, true, ctx->tcs_temp_only_inputs, &indirect_write);
4329       if (temp_only_input && !indirect_write)
4330          return;
4331    }
4332
4333    std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, 4u);
4334    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
4335    unsigned write_mask = nir_intrinsic_write_mask(instr);
4336    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8u;
4337
4338    if (ctx->stage == vertex_es || ctx->stage == tess_eval_es) {
4339       /* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
4340       Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
4341       Temp es2gs_offset = get_arg(ctx, ctx->args->es2gs_offset);
4342       store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, true, true);
4343    } else {
4344       Temp lds_base;
4345
4346       if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
4347          /* GFX9+: ES stage is merged into GS, data is passed between them using LDS. */
4348          unsigned itemsize = ctx->stage == vertex_geometry_gs
4349                              ? ctx->program->info->vs.es_info.esgs_itemsize
4350                              : ctx->program->info->tes.es_info.esgs_itemsize;
4351          Temp thread_id = emit_mbcnt(ctx, bld.def(v1));
4352          Temp wave_idx = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), get_arg(ctx, ctx->args->merged_wave_info), Operand(4u << 16 | 24));
4353          Temp vertex_idx = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), thread_id,
4354                                bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_idx), ctx->program->wave_size));
4355          lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, itemsize);
4356       } else if (ctx->stage == vertex_ls || ctx->stage == vertex_tess_control_hs) {
4357          /* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS.
4358           * GFX9+: LS is merged into HS, but still uses the same LDS layout.
4359           */
4360          Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id);
4361          lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u);
4362       } else {
4363          unreachable("Invalid LS or ES stage");
4364       }
4365
4366       offs = offset_add(ctx, offs, std::make_pair(lds_base, 0u));
4367       unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
4368       store_lds(ctx, elem_size_bytes, src, write_mask, offs.first, offs.second, lds_align);
4369    }
4370 }
4371
4372 bool tcs_output_is_tess_factor(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4373 {
4374    if (per_vertex)
4375       return false;
4376
4377    unsigned off = nir_intrinsic_base(instr) * 4u;
4378    return off == ctx->tcs_tess_lvl_out_loc ||
4379           off == ctx->tcs_tess_lvl_in_loc;
4380
4381 }
4382
4383 bool tcs_output_is_read_by_tes(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4384 {
4385    uint64_t mask = per_vertex
4386                    ? ctx->program->info->tcs.tes_inputs_read
4387                    : ctx->program->info->tcs.tes_patch_inputs_read;
4388
4389    bool indirect_write = false;
4390    bool output_read_by_tes = tcs_driver_location_matches_api_mask(ctx, instr, per_vertex, mask, &indirect_write);
4391    return indirect_write || output_read_by_tes;
4392 }
4393
4394 bool tcs_output_is_read_by_tcs(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4395 {
4396    uint64_t mask = per_vertex
4397                    ? ctx->shader->info.outputs_read
4398                    : ctx->shader->info.patch_outputs_read;
4399
4400    bool indirect_write = false;
4401    bool output_read = tcs_driver_location_matches_api_mask(ctx, instr, per_vertex, mask, &indirect_write);
4402    return indirect_write || output_read;
4403 }
4404
4405 void visit_store_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4406 {
4407    assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
4408    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4409
4410    Builder bld(ctx->program, ctx->block);
4411
4412    Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa);
4413    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4414    unsigned write_mask = nir_intrinsic_write_mask(instr);
4415
4416    bool is_tess_factor = tcs_output_is_tess_factor(ctx, instr, per_vertex);
4417    bool write_to_vmem = !is_tess_factor && tcs_output_is_read_by_tes(ctx, instr, per_vertex);
4418    bool write_to_lds = is_tess_factor || tcs_output_is_read_by_tcs(ctx, instr, per_vertex);
4419
4420    if (write_to_vmem) {
4421       std::pair<Temp, unsigned> vmem_offs = per_vertex
4422                                             ? get_tcs_per_vertex_output_vmem_offset(ctx, instr)
4423                                             : get_tcs_per_patch_output_vmem_offset(ctx, instr);
4424
4425       Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
4426       Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
4427       store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, false);
4428    }
4429
4430    if (write_to_lds) {
4431       std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
4432       unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
4433       store_lds(ctx, elem_size_bytes, store_val, write_mask, lds_offs.first, lds_offs.second, lds_align);
4434    }
4435 }
4436
4437 void visit_load_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex)
4438 {
4439    assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
4440    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
4441
4442    Builder bld(ctx->program, ctx->block);
4443
4444    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4445    std::pair<Temp, unsigned> lds_offs = get_tcs_output_lds_offset(ctx, instr, per_vertex);
4446    unsigned lds_align = calculate_lds_alignment(ctx, lds_offs.second);
4447    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4448
4449    load_lds(ctx, elem_size_bytes, dst, lds_offs.first, lds_offs.second, lds_align);
4450 }
4451
4452 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
4453 {
4454    if (ctx->stage == vertex_vs ||
4455        ctx->stage == tess_eval_vs ||
4456        ctx->stage == fragment_fs ||
4457        ctx->stage == ngg_vertex_gs ||
4458        ctx->stage == ngg_tess_eval_gs ||
4459        ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
4460       bool stored_to_temps = store_output_to_temps(ctx, instr);
4461       if (!stored_to_temps) {
4462          fprintf(stderr, "Unimplemented output offset instruction:\n");
4463          nir_print_instr(instr->src[1].ssa->parent_instr, stderr);
4464          fprintf(stderr, "\n");
4465          abort();
4466       }
4467    } else if (ctx->stage == vertex_es ||
4468               ctx->stage == vertex_ls ||
4469               ctx->stage == tess_eval_es ||
4470               (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
4471               (ctx->stage == vertex_geometry_gs && ctx->shader->info.stage == MESA_SHADER_VERTEX) ||
4472               (ctx->stage == tess_eval_geometry_gs && ctx->shader->info.stage == MESA_SHADER_TESS_EVAL)) {
4473       visit_store_ls_or_es_output(ctx, instr);
4474    } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
4475       visit_store_tcs_output(ctx, instr, false);
4476    } else {
4477       unreachable("Shader stage not implemented");
4478    }
4479 }
4480
4481 void visit_load_output(isel_context *ctx, nir_intrinsic_instr *instr)
4482 {
4483    visit_load_tcs_output(ctx, instr, false);
4484 }
4485
4486 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
4487 {
4488    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
4489    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
4490
4491    Builder bld(ctx->program, ctx->block);
4492
4493    if (dst.regClass() == v2b) {
4494       if (ctx->program->has_16bank_lds) {
4495          assert(ctx->options->chip_class <= GFX8);
4496          Builder::Result interp_p1 =
4497             bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1),
4498                        Operand(2u) /* P0 */, bld.m0(prim_mask), idx, component);
4499          interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b),
4500                                 coord1, bld.m0(prim_mask), interp_p1, idx, component);
4501          bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2,
4502                  bld.m0(prim_mask), interp_p1, idx, component);
4503       } else {
4504          aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
4505
4506          if (ctx->options->chip_class == GFX8)
4507             interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
4508
4509          Builder::Result interp_p1 =
4510             bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1),
4511                        coord1, bld.m0(prim_mask), idx, component);
4512          bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask),
4513                     interp_p1, idx, component);
4514       }
4515    } else {
4516       Builder::Result interp_p1 =
4517          bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
4518                     bld.m0(prim_mask), idx, component);
4519
4520       if (ctx->program->has_16bank_lds)
4521          interp_p1.instr->operands[0].setLateKill(true);
4522
4523       bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2,
4524                  bld.m0(prim_mask), interp_p1, idx, component);
4525    }
4526 }
4527
4528 void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
4529 {
4530    aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
4531    for (unsigned i = 0; i < num_components; i++)
4532       vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i]));
4533    if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) {
4534       assert(num_components == 4);
4535       Builder bld(ctx->program, ctx->block);
4536       vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3]));
4537    }
4538
4539    for (Operand& op : vec->operands)
4540       op = op.isUndefined() ? Operand(0u) : op;
4541
4542    vec->definitions[0] = Definition(dst);
4543    ctx->block->instructions.emplace_back(std::move(vec));
4544    emit_split_vector(ctx, dst, num_components);
4545    return;
4546 }
4547
4548 void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
4549 {
4550    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4551    Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
4552    unsigned idx = nir_intrinsic_base(instr);
4553    unsigned component = nir_intrinsic_component(instr);
4554    Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
4555
4556    nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
4557    if (offset) {
4558       assert(offset->u32 == 0);
4559    } else {
4560       /* the lower 15bit of the prim_mask contain the offset into LDS
4561        * while the upper bits contain the number of prims */
4562       Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
4563       assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
4564       Builder bld(ctx->program, ctx->block);
4565       Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
4566       stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
4567       stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
4568       offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
4569       prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
4570    }
4571
4572    if (instr->dest.ssa.num_components == 1) {
4573       emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
4574    } else {
4575       aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
4576       for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
4577       {
4578          Temp tmp = {ctx->program->allocateId(), v1};
4579          emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
4580          vec->operands[i] = Operand(tmp);
4581       }
4582       vec->definitions[0] = Definition(dst);
4583       ctx->block->instructions.emplace_back(std::move(vec));
4584    }
4585 }
4586
4587 bool check_vertex_fetch_size(isel_context *ctx, const ac_data_format_info *vtx_info,
4588                              unsigned offset, unsigned stride, unsigned channels)
4589 {
4590    unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
4591    if (vtx_info->chan_byte_size != 4 && channels == 3)
4592       return false;
4593    return (ctx->options->chip_class != GFX6 && ctx->options->chip_class != GFX10) ||
4594           (offset % vertex_byte_size == 0 && stride % vertex_byte_size == 0);
4595 }
4596
4597 uint8_t get_fetch_data_format(isel_context *ctx, const ac_data_format_info *vtx_info,
4598                               unsigned offset, unsigned stride, unsigned *channels)
4599 {
4600    if (!vtx_info->chan_byte_size) {
4601       *channels = vtx_info->num_channels;
4602       return vtx_info->chan_format;
4603    }
4604
4605    unsigned num_channels = *channels;
4606    if (!check_vertex_fetch_size(ctx, vtx_info, offset, stride, *channels)) {
4607       unsigned new_channels = num_channels + 1;
4608       /* first, assume more loads is worse and try using a larger data format */
4609       while (new_channels <= 4 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels)) {
4610          new_channels++;
4611          /* don't make the attribute potentially out-of-bounds */
4612          if (offset + new_channels * vtx_info->chan_byte_size > stride)
4613             new_channels = 5;
4614       }
4615
4616       if (new_channels == 5) {
4617          /* then try decreasing load size (at the cost of more loads) */
4618          new_channels = *channels;
4619          while (new_channels > 1 && !check_vertex_fetch_size(ctx, vtx_info, offset, stride, new_channels))
4620             new_channels--;
4621       }
4622
4623       if (new_channels < *channels)
4624          *channels = new_channels;
4625       num_channels = new_channels;
4626    }
4627
4628    switch (vtx_info->chan_format) {
4629    case V_008F0C_BUF_DATA_FORMAT_8:
4630       return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8,
4631                          V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1];
4632    case V_008F0C_BUF_DATA_FORMAT_16:
4633       return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16,
4634                          V_008F0C_BUF_DATA_FORMAT_INVALID, V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1];
4635    case V_008F0C_BUF_DATA_FORMAT_32:
4636       return (uint8_t[]){V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32,
4637                          V_008F0C_BUF_DATA_FORMAT_32_32_32, V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1];
4638    }
4639    unreachable("shouldn't reach here");
4640    return V_008F0C_BUF_DATA_FORMAT_INVALID;
4641 }
4642
4643 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
4644  * so we may need to fix it up. */
4645 Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
4646 {
4647    Builder bld(ctx->program, ctx->block);
4648
4649    if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
4650       alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
4651
4652    /* For the integer-like cases, do a natural sign extension.
4653     *
4654     * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
4655     * and happen to contain 0, 1, 2, 3 as the two LSBs of the
4656     * exponent.
4657     */
4658    alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
4659    alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
4660
4661    /* Convert back to the right type. */
4662    if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
4663       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4664       Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0xbf800000u), alpha);
4665       alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
4666    } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
4667       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
4668    }
4669
4670    return alpha;
4671 }
4672
4673 void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
4674 {
4675    Builder bld(ctx->program, ctx->block);
4676    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4677    if (ctx->shader->info.stage == MESA_SHADER_VERTEX) {
4678
4679       nir_instr *off_instr = instr->src[0].ssa->parent_instr;
4680       if (off_instr->type != nir_instr_type_load_const) {
4681          fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
4682          nir_print_instr(off_instr, stderr);
4683          fprintf(stderr, "\n");
4684       }
4685       uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
4686
4687       Temp vertex_buffers = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->vertex_buffers));
4688
4689       unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
4690       unsigned component = nir_intrinsic_component(instr);
4691       unsigned bitsize = instr->dest.ssa.bit_size;
4692       unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
4693       uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
4694       uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
4695       unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
4696
4697       unsigned dfmt = attrib_format & 0xf;
4698       unsigned nfmt = (attrib_format >> 4) & 0x7;
4699       const struct ac_data_format_info *vtx_info = ac_get_data_format_info(dfmt);
4700
4701       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
4702       unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels);
4703       unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
4704       bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
4705       if (post_shuffle)
4706          num_channels = MAX2(num_channels, 3);
4707
4708       Operand off = bld.copy(bld.def(s1), Operand(attrib_binding * 16u));
4709       Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off);
4710
4711       Temp index;
4712       if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
4713          uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
4714          Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance);
4715          if (divisor) {
4716             Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id);
4717             if (divisor != 1) {
4718                Temp divided = bld.tmp(v1);
4719                emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor);
4720                index = bld.vadd32(bld.def(v1), start_instance, divided);
4721             } else {
4722                index = bld.vadd32(bld.def(v1), start_instance, instance_id);
4723             }
4724          } else {
4725             index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), start_instance);
4726          }
4727       } else {
4728          index = bld.vadd32(bld.def(v1),
4729                             get_arg(ctx, ctx->args->ac.base_vertex),
4730                             get_arg(ctx, ctx->args->ac.vertex_id));
4731       }
4732
4733       Temp channels[num_channels];
4734       unsigned channel_start = 0;
4735       bool direct_fetch = false;
4736
4737       /* skip unused channels at the start */
4738       if (vtx_info->chan_byte_size && !post_shuffle) {
4739          channel_start = ffs(mask) - 1;
4740          for (unsigned i = 0; i < channel_start; i++)
4741             channels[i] = Temp(0, s1);
4742       } else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) {
4743          num_channels = 3 - (ffs(mask) - 1);
4744       }
4745
4746       /* load channels */
4747       while (channel_start < num_channels) {
4748          unsigned fetch_component = num_channels - channel_start;
4749          unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
4750          bool expanded = false;
4751
4752          /* use MUBUF when possible to avoid possible alignment issues */
4753          /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */
4754          bool use_mubuf = (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT ||
4755                            nfmt == V_008F0C_BUF_NUM_FORMAT_UINT ||
4756                            nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) &&
4757                           vtx_info->chan_byte_size == 4;
4758          unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
4759          if (!use_mubuf) {
4760             fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_component);
4761          } else {
4762             if (fetch_component == 3 && ctx->options->chip_class == GFX6) {
4763                /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
4764                fetch_component = 4;
4765                expanded = true;
4766             }
4767          }
4768
4769          unsigned fetch_bytes = fetch_component * bitsize / 8;
4770
4771          Temp fetch_index = index;
4772          if (attrib_stride != 0 && fetch_offset > attrib_stride) {
4773             fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index);
4774             fetch_offset = fetch_offset % attrib_stride;
4775          }
4776
4777          Operand soffset(0u);
4778          if (fetch_offset >= 4096) {
4779             soffset = bld.copy(bld.def(s1), Operand(fetch_offset / 4096 * 4096));
4780             fetch_offset %= 4096;
4781          }
4782
4783          aco_opcode opcode;
4784          switch (fetch_bytes) {
4785          case 2:
4786             assert(!use_mubuf && bitsize == 16);
4787             opcode = aco_opcode::tbuffer_load_format_d16_x;
4788             break;
4789          case 4:
4790             if (bitsize == 16) {
4791                assert(!use_mubuf);
4792                opcode = aco_opcode::tbuffer_load_format_d16_xy;
4793             } else {
4794                opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
4795             }
4796             break;
4797          case 6:
4798             assert(!use_mubuf && bitsize == 16);
4799             opcode = aco_opcode::tbuffer_load_format_d16_xyz;
4800             break;
4801          case 8:
4802             if (bitsize == 16) {
4803                assert(!use_mubuf);
4804                opcode = aco_opcode::tbuffer_load_format_d16_xyzw;
4805             } else {
4806                opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
4807             }
4808             break;
4809          case 12:
4810             assert(ctx->options->chip_class >= GFX7 ||
4811                    (!use_mubuf && ctx->options->chip_class == GFX6));
4812             opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
4813             break;
4814          case 16:
4815             opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
4816             break;
4817          default:
4818             unreachable("Unimplemented load_input vector size");
4819          }
4820
4821          Temp fetch_dst;
4822          if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle &&
4823              !expanded && (alpha_adjust == RADV_ALPHA_ADJUST_NONE ||
4824                            num_channels <= 3)) {
4825             direct_fetch = true;
4826             fetch_dst = dst;
4827          } else {
4828             fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));
4829          }
4830
4831          if (use_mubuf) {
4832             Instruction *mubuf = bld.mubuf(opcode,
4833                                            Definition(fetch_dst), list, fetch_index, soffset,
4834                                            fetch_offset, false, true).instr;
4835             static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
4836          } else {
4837             Instruction *mtbuf = bld.mtbuf(opcode,
4838                                            Definition(fetch_dst), list, fetch_index, soffset,
4839                                            fetch_dfmt, nfmt, fetch_offset, false, true).instr;
4840             static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
4841          }
4842
4843          emit_split_vector(ctx, fetch_dst, fetch_dst.size());
4844
4845          if (fetch_component == 1) {
4846             channels[channel_start] = fetch_dst;
4847          } else {
4848             for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
4849                channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i,
4850                                                                  bitsize == 16 ? v2b : v1);
4851          }
4852
4853          channel_start += fetch_component;
4854       }
4855
4856       if (!direct_fetch) {
4857          bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
4858                          nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
4859
4860          static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
4861          static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
4862          const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
4863
4864          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
4865          std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
4866          unsigned num_temp = 0;
4867          for (unsigned i = 0; i < dst.size(); i++) {
4868             unsigned idx = i + component;
4869             if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) {
4870                Temp channel = channels[swizzle[idx]];
4871                if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE)
4872                   channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel);
4873                vec->operands[i] = Operand(channel);
4874
4875                num_temp++;
4876                elems[i] = channel;
4877             } else if (is_float && idx == 3) {
4878                vec->operands[i] = Operand(0x3f800000u);
4879             } else if (!is_float && idx == 3) {
4880                vec->operands[i] = Operand(1u);
4881             } else {
4882                vec->operands[i] = Operand(0u);
4883             }
4884          }
4885          vec->definitions[0] = Definition(dst);
4886          ctx->block->instructions.emplace_back(std::move(vec));
4887          emit_split_vector(ctx, dst, dst.size());
4888
4889          if (num_temp == dst.size())
4890             ctx->allocated_vec.emplace(dst.id(), elems);
4891       }
4892    } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) {
4893       unsigned offset_idx = instr->intrinsic == nir_intrinsic_load_input ? 0 : 1;
4894       nir_instr *off_instr = instr->src[offset_idx].ssa->parent_instr;
4895       if (off_instr->type != nir_instr_type_load_const ||
4896           nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
4897          fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
4898          nir_print_instr(off_instr, stderr);
4899          fprintf(stderr, "\n");
4900       }
4901
4902       Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
4903       nir_const_value* offset = nir_src_as_const_value(instr->src[offset_idx]);
4904       if (offset) {
4905          assert(offset->u32 == 0);
4906       } else {
4907          /* the lower 15bit of the prim_mask contain the offset into LDS
4908           * while the upper bits contain the number of prims */
4909          Temp offset_src = get_ssa_temp(ctx, instr->src[offset_idx].ssa);
4910          assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
4911          Builder bld(ctx->program, ctx->block);
4912          Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
4913          stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
4914          stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
4915          offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
4916          prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
4917       }
4918
4919       unsigned idx = nir_intrinsic_base(instr);
4920       unsigned component = nir_intrinsic_component(instr);
4921       unsigned vertex_id = 2; /* P0 */
4922
4923       if (instr->intrinsic == nir_intrinsic_load_input_vertex) {
4924          nir_const_value* src0 = nir_src_as_const_value(instr->src[0]);
4925          switch (src0->u32) {
4926          case 0:
4927             vertex_id = 2; /* P0 */
4928             break;
4929          case 1:
4930             vertex_id = 0; /* P10 */
4931             break;
4932          case 2:
4933             vertex_id = 1; /* P20 */
4934             break;
4935          default:
4936             unreachable("invalid vertex index");
4937          }
4938       }
4939
4940       if (dst.size() == 1) {
4941          bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(vertex_id), bld.m0(prim_mask), idx, component);
4942       } else {
4943          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
4944          for (unsigned i = 0; i < dst.size(); i++)
4945             vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(vertex_id), bld.m0(prim_mask), idx, component + i);
4946          vec->definitions[0] = Definition(dst);
4947          bld.insert(std::move(vec));
4948       }
4949
4950    } else if (ctx->shader->info.stage == MESA_SHADER_TESS_EVAL) {
4951       Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
4952       Temp soffset = get_arg(ctx, ctx->args->oc_lds);
4953       std::pair<Temp, unsigned> offs = get_tcs_per_patch_output_vmem_offset(ctx, instr);
4954       unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8u;
4955
4956       load_vmem_mubuf(ctx, dst, ring, offs.first, soffset, offs.second, elem_size_bytes, instr->dest.ssa.num_components);
4957    } else {
4958       unreachable("Shader stage not implemented");
4959    }
4960 }
4961
4962 std::pair<Temp, unsigned> get_gs_per_vertex_input_offset(isel_context *ctx, nir_intrinsic_instr *instr, unsigned base_stride = 1u)
4963 {
4964    assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
4965
4966    Builder bld(ctx->program, ctx->block);
4967    nir_src *vertex_src = nir_get_io_vertex_index_src(instr);
4968    Temp vertex_offset;
4969
4970    if (!nir_src_is_const(*vertex_src)) {
4971       /* better code could be created, but this case probably doesn't happen
4972        * much in practice */
4973       Temp indirect_vertex = as_vgpr(ctx, get_ssa_temp(ctx, vertex_src->ssa));
4974       for (unsigned i = 0; i < ctx->shader->info.gs.vertices_in; i++) {
4975          Temp elem;
4976
4977          if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
4978             elem = get_arg(ctx, ctx->args->gs_vtx_offset[i / 2u * 2u]);
4979             if (i % 2u)
4980                elem = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), elem);
4981          } else {
4982             elem = get_arg(ctx, ctx->args->gs_vtx_offset[i]);
4983          }
4984
4985          if (vertex_offset.id()) {
4986             Temp cond = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)),
4987                                  Operand(i), indirect_vertex);
4988             vertex_offset = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), vertex_offset, elem, cond);
4989          } else {
4990             vertex_offset = elem;
4991          }
4992       }
4993
4994       if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs)
4995          vertex_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu), vertex_offset);
4996    } else {
4997       unsigned vertex = nir_src_as_uint(*vertex_src);
4998       if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs)
4999          vertex_offset = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
5000                                   get_arg(ctx, ctx->args->gs_vtx_offset[vertex / 2u * 2u]),
5001                                   Operand((vertex % 2u) * 16u), Operand(16u));
5002       else
5003          vertex_offset = get_arg(ctx, ctx->args->gs_vtx_offset[vertex]);
5004    }
5005
5006    std::pair<Temp, unsigned> offs = get_intrinsic_io_basic_offset(ctx, instr, base_stride);
5007    offs = offset_add(ctx, offs, std::make_pair(vertex_offset, 0u));
5008    return offset_mul(ctx, offs, 4u);
5009 }
5010
5011 void visit_load_gs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
5012 {
5013    assert(ctx->shader->info.stage == MESA_SHADER_GEOMETRY);
5014
5015    Builder bld(ctx->program, ctx->block);
5016    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5017    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
5018
5019    if (ctx->stage == geometry_gs) {
5020       std::pair<Temp, unsigned> offs = get_gs_per_vertex_input_offset(ctx, instr, ctx->program->wave_size);
5021       Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_GS * 16u));
5022       load_vmem_mubuf(ctx, dst, ring, offs.first, Temp(), offs.second, elem_size_bytes, instr->dest.ssa.num_components, 4u * ctx->program->wave_size, false, true);
5023    } else if (ctx->stage == vertex_geometry_gs || ctx->stage == tess_eval_geometry_gs) {
5024       std::pair<Temp, unsigned> offs = get_gs_per_vertex_input_offset(ctx, instr);
5025       unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
5026       load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align);
5027    } else {
5028       unreachable("Unsupported GS stage.");
5029    }
5030 }
5031
5032 void visit_load_tcs_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
5033 {
5034    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5035
5036    Builder bld(ctx->program, ctx->block);
5037    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5038
5039    if (load_input_from_temps(ctx, instr, dst))
5040       return;
5041
5042    std::pair<Temp, unsigned> offs = get_tcs_per_vertex_input_lds_offset(ctx, instr);
5043    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
5044    unsigned lds_align = calculate_lds_alignment(ctx, offs.second);
5045
5046    load_lds(ctx, elem_size_bytes, dst, offs.first, offs.second, lds_align);
5047 }
5048
5049 void visit_load_tes_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
5050 {
5051    assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5052
5053    Builder bld(ctx->program, ctx->block);
5054
5055    Temp ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
5056    Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
5057    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5058
5059    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
5060    std::pair<Temp, unsigned> offs = get_tcs_per_vertex_output_vmem_offset(ctx, instr);
5061
5062    load_vmem_mubuf(ctx, dst, ring, offs.first, oc_lds, offs.second, elem_size_bytes, instr->dest.ssa.num_components, 0u, true, true);
5063 }
5064
5065 void visit_load_per_vertex_input(isel_context *ctx, nir_intrinsic_instr *instr)
5066 {
5067    switch (ctx->shader->info.stage) {
5068    case MESA_SHADER_GEOMETRY:
5069       visit_load_gs_per_vertex_input(ctx, instr);
5070       break;
5071    case MESA_SHADER_TESS_CTRL:
5072       visit_load_tcs_per_vertex_input(ctx, instr);
5073       break;
5074    case MESA_SHADER_TESS_EVAL:
5075       visit_load_tes_per_vertex_input(ctx, instr);
5076       break;
5077    default:
5078       unreachable("Unimplemented shader stage");
5079    }
5080 }
5081
5082 void visit_load_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr)
5083 {
5084    visit_load_tcs_output(ctx, instr, true);
5085 }
5086
5087 void visit_store_per_vertex_output(isel_context *ctx, nir_intrinsic_instr *instr)
5088 {
5089    assert(ctx->stage == tess_control_hs || ctx->stage == vertex_tess_control_hs);
5090    assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
5091
5092    visit_store_tcs_output(ctx, instr, true);
5093 }
5094
5095 void visit_load_tess_coord(isel_context *ctx, nir_intrinsic_instr *instr)
5096 {
5097    assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
5098
5099    Builder bld(ctx->program, ctx->block);
5100    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5101
5102    Operand tes_u(get_arg(ctx, ctx->args->tes_u));
5103    Operand tes_v(get_arg(ctx, ctx->args->tes_v));
5104    Operand tes_w(0u);
5105
5106    if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) {
5107       Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v);
5108       tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0x3f800000u /* 1.0f */), tmp);
5109       tes_w = Operand(tmp);
5110    }
5111
5112    Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w);
5113    emit_split_vector(ctx, tess_coord, 3);
5114 }
5115
5116 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
5117 {
5118    if (ctx->program->info->need_indirect_descriptor_sets) {
5119       Builder bld(ctx->program, ctx->block);
5120       Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0]));
5121       Operand off = bld.copy(bld.def(s1), Operand(desc_set << 2));
5122       return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off);//, false, false, false);
5123    }
5124
5125    return get_arg(ctx, ctx->args->descriptor_sets[desc_set]);
5126 }
5127
5128
5129 void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
5130 {
5131    Builder bld(ctx->program, ctx->block);
5132    Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
5133    if (!nir_dest_is_divergent(instr->dest))
5134       index = bld.as_uniform(index);
5135    unsigned desc_set = nir_intrinsic_desc_set(instr);
5136    unsigned binding = nir_intrinsic_binding(instr);
5137
5138    Temp desc_ptr;
5139    radv_pipeline_layout *pipeline_layout = ctx->options->layout;
5140    radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
5141    unsigned offset = layout->binding[binding].offset;
5142    unsigned stride;
5143    if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
5144        layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
5145       unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
5146       desc_ptr = get_arg(ctx, ctx->args->ac.push_constants);
5147       offset = pipeline_layout->push_constant_size + 16 * idx;
5148       stride = 16;
5149    } else {
5150       desc_ptr = load_desc_ptr(ctx, desc_set);
5151       stride = layout->binding[binding].size;
5152    }
5153
5154    nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
5155    unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
5156    if (stride != 1) {
5157       if (nir_const_index) {
5158          const_index = const_index * stride;
5159       } else if (index.type() == RegType::vgpr) {
5160          bool index24bit = layout->binding[binding].array_size <= 0x1000000;
5161          index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
5162       } else {
5163          index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
5164       }
5165    }
5166    if (offset) {
5167       if (nir_const_index) {
5168          const_index = const_index + offset;
5169       } else if (index.type() == RegType::vgpr) {
5170          index = bld.vadd32(bld.def(v1), Operand(offset), index);
5171       } else {
5172          index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
5173       }
5174    }
5175
5176    if (nir_const_index && const_index == 0) {
5177       index = desc_ptr;
5178    } else if (index.type() == RegType::vgpr) {
5179       index = bld.vadd32(bld.def(v1),
5180                          nir_const_index ? Operand(const_index) : Operand(index),
5181                          Operand(desc_ptr));
5182    } else {
5183       index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
5184                        nir_const_index ? Operand(const_index) : Operand(index),
5185                        Operand(desc_ptr));
5186    }
5187
5188    bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
5189 }
5190
5191 void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
5192                  Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
5193                  bool glc=false, bool readonly=true, bool allow_smem=true)
5194 {
5195    Builder bld(ctx->program, ctx->block);
5196
5197    bool use_smem = dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem;
5198    if (use_smem)
5199       offset = bld.as_uniform(offset);
5200
5201    LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
5202    info.glc = glc;
5203    info.barrier = readonly ? barrier_none : barrier_buffer;
5204    info.can_reorder = readonly;
5205    info.align_mul = align_mul;
5206    info.align_offset = align_offset;
5207    if (use_smem)
5208       emit_smem_load(ctx, bld, &info);
5209    else
5210       emit_mubuf_load(ctx, bld, &info);
5211 }
5212
5213 void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
5214 {
5215    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5216    Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
5217
5218    Builder bld(ctx->program, ctx->block);
5219
5220    nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
5221    unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
5222    unsigned binding = nir_intrinsic_binding(idx_instr);
5223    radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
5224
5225    if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
5226       uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
5227                            S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5228                            S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
5229                            S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5230       if (ctx->options->chip_class >= GFX10) {
5231          desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
5232                       S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
5233                       S_008F0C_RESOURCE_LEVEL(1);
5234       } else {
5235          desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5236                       S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5237       }
5238       Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
5239                                      Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
5240                                      Operand(0xFFFFFFFFu),
5241                                      Operand(desc_type));
5242       rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5243                         rsrc, upper_dwords);
5244    } else {
5245       rsrc = convert_pointer_to_64_bit(ctx, rsrc);
5246       rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
5247    }
5248    unsigned size = instr->dest.ssa.bit_size / 8;
5249    load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
5250                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
5251 }
5252
5253 void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
5254 {
5255    Builder bld(ctx->program, ctx->block);
5256    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5257    unsigned offset = nir_intrinsic_base(instr);
5258    unsigned count = instr->dest.ssa.num_components;
5259    nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
5260
5261    if (index_cv && instr->dest.ssa.bit_size == 32) {
5262       unsigned start = (offset + index_cv->u32) / 4u;
5263       start -= ctx->args->ac.base_inline_push_consts;
5264       if (start + count <= ctx->args->ac.num_inline_push_consts) {
5265          std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
5266          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
5267          for (unsigned i = 0; i < count; ++i) {
5268             elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]);
5269             vec->operands[i] = Operand{elems[i]};
5270          }
5271          vec->definitions[0] = Definition(dst);
5272          ctx->block->instructions.emplace_back(std::move(vec));
5273          ctx->allocated_vec.emplace(dst.id(), elems);
5274          return;
5275       }
5276    }
5277
5278    Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
5279    if (offset != 0) // TODO check if index != 0 as well
5280       index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
5281    Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
5282    Temp vec = dst;
5283    bool trim = false;
5284    bool aligned = true;
5285
5286    if (instr->dest.ssa.bit_size == 8) {
5287       aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5288       bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4);
5289       if (!aligned)
5290          vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2);
5291    } else if (instr->dest.ssa.bit_size == 16) {
5292       aligned = index_cv && (offset + index_cv->u32) % 4 == 0;
5293       if (!aligned)
5294          vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1);
5295    }
5296
5297    aco_opcode op;
5298
5299    switch (vec.size()) {
5300    case 1:
5301       op = aco_opcode::s_load_dword;
5302       break;
5303    case 2:
5304       op = aco_opcode::s_load_dwordx2;
5305       break;
5306    case 3:
5307       vec = bld.tmp(s4);
5308       trim = true;
5309    case 4:
5310       op = aco_opcode::s_load_dwordx4;
5311       break;
5312    case 6:
5313       vec = bld.tmp(s8);
5314       trim = true;
5315    case 8:
5316       op = aco_opcode::s_load_dwordx8;
5317       break;
5318    default:
5319       unreachable("unimplemented or forbidden load_push_constant.");
5320    }
5321
5322    bld.smem(op, Definition(vec), ptr, index);
5323
5324    if (!aligned) {
5325       Operand byte_offset = index_cv ? Operand((offset + index_cv->u32) % 4) : Operand(index);
5326       byte_align_scalar(ctx, vec, byte_offset, dst);
5327       return;
5328    }
5329
5330    if (trim) {
5331       emit_split_vector(ctx, vec, 4);
5332       RegClass rc = dst.size() == 3 ? s1 : s2;
5333       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5334                  emit_extract_vector(ctx, vec, 0, rc),
5335                  emit_extract_vector(ctx, vec, 1, rc),
5336                  emit_extract_vector(ctx, vec, 2, rc));
5337
5338    }
5339    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
5340 }
5341
5342 void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
5343 {
        /* Instruction selection for the NIR load_constant intrinsic.
         *
         * Builds a 4-dword buffer resource (p_constaddr result, size word,
         * format/swizzle word) and hands the actual load to load_buffer().
         *
         * ctx:   selection context; instructions are appended to ctx->block.
         * instr: the intrinsic; src[0] is the offset, base and range are read
         *        from the intrinsic's constant indices.
         */
5344    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5345
5346    Builder bld(ctx->program, ctx->block);
5347
        /* Last resource word: X/Y/Z/W destination selects, plus format fields
         * that are encoded differently on GFX10+ and on older chips. */
5348    uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
5349                         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
5350                         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
5351                         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
5352    if (ctx->options->chip_class >= GFX10) {
5353       desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
5354                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
5355                    S_008F0C_RESOURCE_LEVEL(1);
5356    } else {
5357       desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
5358                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
5359    }
5360
5361    unsigned base = nir_intrinsic_base(instr);
5362    unsigned range = nir_intrinsic_range(instr);
5363
        /* Fold a non-zero base into the offset; the add that is emitted depends
         * on whether the offset lives in an SGPR or a VGPR. */
5364    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5365    if (base && offset.type() == RegType::sgpr)
5366       offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
5367    else if (base && offset.type() == RegType::vgpr)
5368       offset = bld.vadd32(bld.def(v1), Operand(base), offset);
5369
        /* Resource = { 2-dword p_constaddr result, size word, desc_type }.
         * The size word is base + range, clamped to the shader's constant_data_size. */
5370    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5371                           bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
5372                           Operand(MIN2(base + range, ctx->shader->constant_data_size)),
5373                           Operand(desc_type));
        /* Size of one component in bytes. */
5374    unsigned size = instr->dest.ssa.bit_size / 8;
5375    // TODO: get alignment information for subdword constants
        /* NOTE(review): `size` is passed twice; the second use is presumably the
         * alignment the TODO above refers to - confirm against load_buffer(). */
5376    load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
5377 }
5378
5379 void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
5380 {
        /* Instruction selection for the NIR discard_if intrinsic.
         *
         * Emits a p_discard_if whose condition is src[0] ANDed with exec, marks
         * the block as using discard_if and sets program->needs_exact.
         */

        /* Inside a loop or a divergent if, flag that exec may end up empty
         * because of this discard. */
5381    if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5382       ctx->cf_info.exec_potentially_empty_discard = true;
5383
5384    ctx->program->needs_exact = true;
5385
5386    // TODO: optimize uniform conditions
5387    Builder bld(ctx->program, ctx->block);
5388    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
        /* The condition must already be a lane mask. */
5389    assert(src.regClass() == bld.lm);
        /* Restrict the condition to the lanes in exec. */
5390    src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
5391    bld.pseudo(aco_opcode::p_discard_if, src);
5392    ctx->block->kind |= block_kind_uses_discard_if;
5393    return;
5394 }
5395
5396 void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
5397 {
        /* Instruction selection for the (unconditional) NIR discard intrinsic.
         *
         * Three situations are distinguished, in this order:
         *  1. The current block is inside a loop and the discard is divergent
         *     or is the last instruction of its NIR block: the discard is
         *     handled like a break out of the loop.
         *  2. The discard is not the last instruction of its NIR block: a
         *     p_discard_if on the current exec mask is emitted.
         *  3. Otherwise: if the parent if is not divergent the program ends
         *     here (export + s_endpgm); else the block is only tagged
         *     block_kind_discard and visit_if() adds the branch.
         */
5398    Builder bld(ctx->program, ctx->block);
5399
        /* Inside a loop or a divergent if, flag that exec may end up empty
         * because of this discard. */
5400    if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
5401       ctx->cf_info.exec_potentially_empty_discard = true;
5402
5403    bool divergent = ctx->cf_info.parent_if.is_divergent ||
5404                     ctx->cf_info.parent_loop.has_divergent_continue;
5405
        /* The second operand is logically (is_last || divergent): a uniform
         * discard that is not last in its NIR block is not handled here and
         * falls through to the p_discard_if path below. */
5406    if (ctx->block->loop_nest_depth &&
5407        ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
5408       /* we handle discards the same way as jump instructions */
5409       append_logical_end(ctx->block);
5410
5411       /* in loops, discard behaves like break */
5412       Block *linear_target = ctx->cf_info.parent_loop.exit;
5413       ctx->block->kind |= block_kind_discard;
5414
5415       if (!divergent) {
5416          /* uniform discard - loop ends here */
5417          assert(nir_instr_is_last(&instr->instr));
5418          ctx->block->kind |= block_kind_uniform;
5419          ctx->cf_info.has_branch = true;
5420          bld.branch(aco_opcode::p_branch);
5421          add_linear_edge(ctx->block->index, linear_target);
5422          return;
5423       }
5424
5425       /* we add a break right behind the discard() instructions */
5426       ctx->block->kind |= block_kind_break;
5427       unsigned idx = ctx->block->index;
5428
5429       ctx->cf_info.parent_loop.has_divergent_branch = true;
5430       ctx->cf_info.nir_to_aco[instr->instr.block->index] = idx;
5431
5432       /* remove critical edges from linear CFG */
             /* The current block gets two linear successors: break_block, which
              * only branches on to the loop exit, and continue_block, where
              * instruction selection continues. */
5433       bld.branch(aco_opcode::p_branch);
5434       Block* break_block = ctx->program->create_and_insert_block();
5435       break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
5436       break_block->kind |= block_kind_uniform;
5437       add_linear_edge(idx, break_block);
5438       add_linear_edge(break_block->index, linear_target);
5439       bld.reset(break_block);
5440       bld.branch(aco_opcode::p_branch);
5441
5442       Block* continue_block = ctx->program->create_and_insert_block();
5443       continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
5444       add_linear_edge(idx, continue_block);
5445       append_logical_start(continue_block);
5446       ctx->block = continue_block;
5447
5448       return;
5449    }
5450
5451    /* it can currently happen that NIR doesn't remove the unreachable code */
5452    if (!nir_instr_is_last(&instr->instr)) {
5453       ctx->program->needs_exact = true;
5454       /* save exec somewhere temporarily so that it doesn't get
5455        * overwritten before the discard from outer exec masks */
5456       Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, bld.lm));
5457       bld.pseudo(aco_opcode::p_discard_if, cond);
5458       ctx->block->kind |= block_kind_uses_discard_if;
5459       return;
5460    }
5461
5462    /* This condition is incorrect for uniformly branched discards in a loop
5463     * predicated by a divergent condition, but the above code catches that case
5464     * and the discard would end up turning into a discard_if.
5465     * For example:
5466     * if (divergent) {
5467     *    while (...) {
5468     *       if (uniform) {
5469     *          discard;
5470     *       }
5471     *    }
5472     * }
5473     */
5474    if (!ctx->cf_info.parent_if.is_divergent) {
5475       /* program just ends here */
5476       ctx->block->kind |= block_kind_uniform;
             /* Export with nothing enabled but with done and valid-mask set,
              * followed by the end of the program. */
5477       bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
5478               0 /* enabled mask */, 9 /* dest */,
5479               false /* compressed */, true/* done */, true /* valid mask */);
5480       bld.sopp(aco_opcode::s_endpgm);
5481       // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
5482    } else {
5483       ctx->block->kind |= block_kind_discard;
5484       /* branch and linear edge is added by visit_if() */
5485    }
5486 }
5487
5488 enum aco_descriptor_type {
        /* Selects which descriptor get_sampler_desc() fetches from a binding.
         * The per-value notes describe what get_sampler_desc() does with it.
         * ACO_DESC_PLANE_0..2 must stay consecutive: get_sampler_desc()
         * computes (desc_type - ACO_DESC_PLANE_0). */

        /* 8 dwords at the binding's offset. */
5489    ACO_DESC_IMAGE,
        /* 8 dwords at the binding's offset + 32. */
5490    ACO_DESC_FMASK,
        /* 4 dwords; for combined image+sampler bindings the sampler offset is
         * added. May be built from immutable samplers instead of being loaded. */
5491    ACO_DESC_SAMPLER,
        /* 4 dwords at the binding's offset. */
5492    ACO_DESC_BUFFER,
        /* 8 dwords at the binding's offset + 32 * plane index (planes 0 and 1). */
5493    ACO_DESC_PLANE_0,
5494    ACO_DESC_PLANE_1,
        /* 4 dwords at the binding's offset + 64, returned as 8 dwords after being
         * combined with the upper half of the ACO_DESC_PLANE_1 descriptor. */
5495    ACO_DESC_PLANE_2,
5496 };
5497
5498 static bool
5499 should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
        /* Returns whether an image instruction for this sampler dimension should
         * be flagged as an array access (callers store the result in the `da`
         * field of the instruction).
         *
         * Buffer dimensions never are. Otherwise the answer is true when
         * ac_get_sampler_dim() maps the dimension to cube, 1D array, 2D array or
         * 2D MSAA array for the current chip_class.
         */
5500    if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
5501       return false;
5502    ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
5503    return dim == ac_image_cube ||
5504           dim == ac_image_1darray ||
5505           dim == ac_image_2darray ||
5506           dim == ac_image_2darraymsaa;
5507 }
5508
5509 Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
5510                       enum aco_descriptor_type desc_type,
5511                       const nir_tex_instr *tex_instr, bool image, bool write)
5512 {
        /* Produces the descriptor of kind `desc_type` for a texture/image access.
         *
         * deref_instr: deref chain of the accessed variable, or NULL. When NULL,
         *              tex_instr must be set and `image` must be false; set 0 and
         *              tex_instr->sampler_index are used as set/binding.
         * desc_type:   which descriptor to fetch (see aco_descriptor_type).
         * tex_instr:   only read when deref_instr is NULL.
         * image/write: `image` is only checked in the assert below; both are
         *              otherwise just forwarded to the recursive call.
         *
         * Returns an 8-dword or 4-dword SGPR temp depending on desc_type. For
         * ACO_DESC_PLANE_2 an 8-dword temp is assembled. Immutable samplers may
         * be returned as a vector of constants without emitting any load.
         */
5513 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
5514    std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
5515    if (it != ctx->tex_desc.end())
5516       return it->second;
5517 */
5518    Temp index = Temp();
5519    bool index_set = false;
5520    unsigned constant_index = 0;
5521    unsigned descriptor_set;
5522    unsigned base_index;
5523    Builder bld(ctx->program, ctx->block);
5524
5525    if (!deref_instr) {
5526       assert(tex_instr && !image);
5527       descriptor_set = 0;
5528       base_index = tex_instr->sampler_index;
5529    } else {
           /* Walk the deref chain up to the variable. Constant array indices are
            * accumulated in constant_index, non-constant ones in `index`; each
            * is scaled by the array-of-arrays size of the indexed element. */
5530       while(deref_instr->deref_type != nir_deref_type_var) {
5531          unsigned array_size = glsl_get_aoa_size(deref_instr->type);
5532          if (!array_size)
5533             array_size = 1;
5534
5535          assert(deref_instr->deref_type == nir_deref_type_array);
5536          nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
5537          if (const_value) {
5538             constant_index += array_size * const_value->u32;
5539          } else {
5540             Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
                 /* A VGPR index is reduced to a scalar with v_readfirstlane. */
5541             if (indirect.type() == RegType::vgpr)
5542                indirect = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), indirect);
5543
5544             if (array_size != 1)
5545                indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
5546
5547             if (!index_set) {
5548                index = indirect;
5549                index_set = true;
5550             } else {
5551                index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
5552             }
5553          }
5554
5555          deref_instr = nir_src_as_deref(deref_instr->parent);
5556       }
           /* deref_instr now refers to the variable deref. */
5557       descriptor_set = deref_instr->var->data.descriptor_set;
5558       base_index = deref_instr->var->data.binding;
5559    }
5560
5561    Temp list = load_desc_ptr(ctx, descriptor_set);
5562    list = convert_pointer_to_64_bit(ctx, list);
5563
5564    struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
5565    struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
5566    unsigned offset = binding->offset;
5567    unsigned stride = binding->size;
5568    aco_opcode opcode;
5569    RegClass type;
5570
        /* NOTE(review): `binding` has already been dereferenced above, so this
         * assert only reports an out-of-range binding after the fact. */
5571    assert(base_index < layout->binding_count);
5572
        /* Pick load width and the offset of the descriptor inside the binding. */
5573    switch (desc_type) {
5574    case ACO_DESC_IMAGE:
5575       type = s8;
5576       opcode = aco_opcode::s_load_dwordx8;
5577       break;
5578    case ACO_DESC_FMASK:
5579       type = s8;
5580       opcode = aco_opcode::s_load_dwordx8;
5581       offset += 32;
5582       break;
5583    case ACO_DESC_SAMPLER:
5584       type = s4;
5585       opcode = aco_opcode::s_load_dwordx4;
5586       if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
5587          offset += radv_combined_image_descriptor_sampler_offset(binding);
5588       break;
5589    case ACO_DESC_BUFFER:
5590       type = s4;
5591       opcode = aco_opcode::s_load_dwordx4;
5592       break;
5593    case ACO_DESC_PLANE_0:
5594    case ACO_DESC_PLANE_1:
5595       type = s8;
5596       opcode = aco_opcode::s_load_dwordx8;
5597       offset += 32 * (desc_type - ACO_DESC_PLANE_0);
5598       break;
5599    case ACO_DESC_PLANE_2:
5600       type = s4;
5601       opcode = aco_opcode::s_load_dwordx4;
5602       offset += 64;
5603       break;
5604    default:
5605       unreachable("invalid desc_type\n");
5606    }
5607
5608    offset += constant_index * stride;
5609
        /* Immutable samplers: when there is no dynamic index, or all immutable
         * samplers of the binding are equal, build the descriptor from the
         * layout's constants instead of loading it. If they are all equal,
         * element 0 is used. */
5610    if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
5611       (!index_set || binding->immutable_samplers_equal)) {
5612       if (binding->immutable_samplers_equal)
5613          constant_index = 0;
5614
5615       const uint32_t *samplers = radv_immutable_samplers(layout, binding);
5616       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
5617                         Operand(samplers[constant_index * 4 + 0]),
5618                         Operand(samplers[constant_index * 4 + 1]),
5619                         Operand(samplers[constant_index * 4 + 2]),
5620                         Operand(samplers[constant_index * 4 + 3]));
5621    }
5622
        /* Load offset: the constant offset alone, or offset + stride * index
         * when a dynamic index was found. */
5623    Operand off;
5624    if (!index_set) {
5625       off = bld.copy(bld.def(s1), Operand(offset));
5626    } else {
5627       off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
5628                                    bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
5629    }
5630
5631    Temp res = bld.smem(opcode, bld.def(type), list, off);
5632
        /* Plane 2 is loaded as 4 dwords and widened to 8: dwords 0-3 come from
         * the load above, dwords 4-7 from the upper half of the plane 1
         * descriptor (its lower four dwords go to unused definitions). */
5633    if (desc_type == ACO_DESC_PLANE_2) {
5634       Temp components[8];
5635       for (unsigned i = 0; i < 8; i++)
5636          components[i] = bld.tmp(s1);
5637       bld.pseudo(aco_opcode::p_split_vector,
5638                  Definition(components[0]),
5639                  Definition(components[1]),
5640                  Definition(components[2]),
5641                  Definition(components[3]),
5642                  res);
5643
           /* NOTE(review): deref_instr was advanced to the variable deref by the
            * loop above, so this recursive call does not see any array indices
            * of the original chain - confirm that this is intended. */
5644       Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
5645       bld.pseudo(aco_opcode::p_split_vector,
5646                  bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
5647                  Definition(components[4]),
5648                  Definition(components[5]),
5649                  Definition(components[6]),
5650                  Definition(components[7]),
5651                  desc2);
5652
5653       res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
5654                        components[0], components[1], components[2], components[3],
5655                        components[4], components[5], components[6], components[7]);
5656    }
5657
5658    return res;
5659 }
5660
5661 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
5662 {
        /* Returns the number of coordinate components get_image_coords()
         * gathers for an image of the given dimension, or 0 for a dimension
         * that is not listed.
         *
         * For the multisampled dimensions the count includes the sample index
         * (get_image_coords() stores it in the last slot).
         */
5663    switch (dim) {
5664    case GLSL_SAMPLER_DIM_BUF:
5665       return 1;
5666    case GLSL_SAMPLER_DIM_1D:
5667       return array ? 2 : 1;
5668    case GLSL_SAMPLER_DIM_2D:
5669       return array ? 3 : 2;
5670    case GLSL_SAMPLER_DIM_MS:
5671       return array ? 4 : 3;
5672    case GLSL_SAMPLER_DIM_3D:
5673    case GLSL_SAMPLER_DIM_CUBE:
5674       return 3;
5675    case GLSL_SAMPLER_DIM_RECT:
5676    case GLSL_SAMPLER_DIM_SUBPASS:
5677       return 2;
5678    case GLSL_SAMPLER_DIM_SUBPASS_MS:
5679       return 3;
5680    default:
5681       break;
5682    }
5683    return 0;
5684 }
5685
5686
5687 /* Adjust the sample index according to FMASK.
5688  *
5689  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
5690  * which is the identity mapping. Each nibble says which physical sample
5691  * should be fetched to get that sample.
5692  *
5693  * For example, 0x11111100 means there are only 2 samples stored and
5694  * the second sample covers 3/4 of the pixel. When reading samples 0
5695  * and 1, return physical sample 0 (determined by the first two 0s
5696  * in FMASK), otherwise return physical sample 1.
5697  *
5698  * The sample index should be adjusted as follows:
5699  *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
      *
      * Parameters:
      *   da             - whether the image is an array; selects a 3- or
      *                    2-component coordinate vector for the FMASK load.
      *   coords         - coordinates; elements 0 and 1 (and 2 if da) are read.
      *   sample_index   - logical sample index: a constant, an s1 or a v1 operand.
      *   fmask_desc_ptr - the FMASK descriptor temp itself (it is used directly
      *                    as the resource operand of the image_load).
      *
      * Returns a v1 temp holding the adjusted sample index.
5700  */
5701 static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vector<Temp>& coords, Operand sample_index, Temp fmask_desc_ptr)
5702 {
5703    Builder bld(ctx->program, ctx->block);
5704    Temp fmask = bld.tmp(v1);
        /* The dim field is only computed for GFX10+; it is left 0 otherwise. */
5705    unsigned dim = ctx->options->chip_class >= GFX10
5706                   ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
5707                   : 0;
5708
        /* Load one dword (dmask 0x1) of FMASK data at the given coordinates. */
5709    Temp coord = da ? bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), coords[0], coords[1], coords[2]) :
5710                      bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), coords[0], coords[1]);
5711    aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 3, 1)};
5712    load->operands[0] = Operand(fmask_desc_ptr);
5713    load->operands[1] = Operand(s4); /* no sampler */
5714    load->operands[2] = Operand(coord);
5715    load->definitions[0] = Definition(fmask);
5716    load->glc = false;
5717    load->dlc = false;
5718    load->dmask = 0x1;
5719    load->unrm = true;
5720    load->da = da;
5721    load->dim = dim;
5722    load->can_reorder = true; /* fmask images shouldn't be modified */
5723    ctx->block->instructions.emplace_back(std::move(load));
5724
        /* sample_index4 = sample_index * 4, i.e. the bit offset of the nibble.
         * A constant index of 16 or more is replaced by offset 0. */
5725    Operand sample_index4;
5726    if (sample_index.isConstant()) {
5727       if (sample_index.constantValue() < 16) {
5728          sample_index4 = Operand(sample_index.constantValue() << 2);
5729       } else {
5730          sample_index4 = Operand(0u);
5731       }
5732    } else if (sample_index.regClass() == s1) {
5733       sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
5734    } else {
5735       assert(sample_index.regClass() == v1);
5736       sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
5737    }
5738
        /* Extract the 4-bit field. Constant offsets 0 (lowest nibble) and 28
         * (highest nibble) get a plain AND / shift; everything else uses a
         * bitfield extract. */
5739    Temp final_sample;
5740    if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
5741       final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
5742    else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
5743       final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
5744    else
5745       final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
5746
5747    /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
5748     * resource descriptor is 0 (invalid),
5749     */
        /* The compare tests dword 1 of the descriptor against zero. */
5750    Temp compare = bld.tmp(bld.lm);
5751    bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
5752                 Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
5753
5754    Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
5755
5756    /* Replace the MSAA sample index. */
5757    return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
5758 }
5759
5760 static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
5761 {
        /* Builds the VGPR coordinate vector for an image intrinsic.
         *
         * instr: the image intrinsic; src[1] holds the coordinates, src[2] the
         *        sample index (multisampled images), src[3]/src[4] the LOD for
         *        image_deref_load/image_deref_store.
         * type:  sampler type of the image (array wrapping already removed by
         *        the callers in this file).
         *
         * Returns a VGPR temp with one dword per gathered component: the
         * coordinates, then the sample index if multisampled, then the LOD if
         * it is not a constant zero.
         */
5762
5763    Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
5764    enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5765    bool is_array = glsl_sampler_type_is_array(type);
5766    ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
5767    assert(!add_frag_pos && "Input attachments should be lowered.");
5768    bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
5769    bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
5770    int count = image_type_to_components_count(dim, is_array);
5771    std::vector<Temp> coords(count);
5772    Builder bld(ctx->program, ctx->block);
5773
        /* Multisampled: the last slot holds the sample index, so `count` is
         * reduced to the number of real coordinates. */
5774    if (is_ms) {
5775       count--;
5776       Temp src2 = get_ssa_temp(ctx, instr->src[2].ssa);
5777       /* get sample index */
           /* Only loads have the sample index remapped through FMASK; other
            * intrinsics use it as given. */
5778       if (instr->intrinsic == nir_intrinsic_image_deref_load) {
5779          nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
5780          Operand sample_index = sample_cv ? Operand(sample_cv->u32) : Operand(emit_extract_vector(ctx, src2, 0, v1));
5781          std::vector<Temp> fmask_load_address;
5782          for (unsigned i = 0; i < (is_array ? 3 : 2); i++)
5783             fmask_load_address.emplace_back(emit_extract_vector(ctx, src0, i, v1));
5784
5785          Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
5786          coords[count] = adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr);
5787       } else {
5788          coords[count] = emit_extract_vector(ctx, src2, 0, v1);
5789       }
5790    }
5791
        /* GFX9 1D images get an extra zero as second coordinate; for arrays the
         * layer moves to the third slot. */
5792    if (gfx9_1d) {
5793       coords[0] = emit_extract_vector(ctx, src0, 0, v1);
5794       coords.resize(coords.size() + 1);
5795       coords[1] = bld.copy(bld.def(v1), Operand(0u));
5796       if (is_array)
5797          coords[2] = emit_extract_vector(ctx, src0, 1, v1);
5798    } else {
5799       for (int i = 0; i < count; i++)
5800          coords[i] = emit_extract_vector(ctx, src0, i, v1);
5801    }
5802
        /* Loads and stores append the LOD unless it is a constant zero. */
5803    if (instr->intrinsic == nir_intrinsic_image_deref_load ||
5804        instr->intrinsic == nir_intrinsic_image_deref_store) {
5805       int lod_index = instr->intrinsic == nir_intrinsic_image_deref_load ? 3 : 4;
5806       bool level_zero = nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0;
5807
5808       if (!level_zero)
5809          coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa));
5810    }
5811
        /* Pack everything into a single VGPR vector. */
5812    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
5813    for (unsigned i = 0; i < coords.size(); i++)
5814       vec->operands[i] = Operand(coords[i]);
5815    Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
5816    vec->definitions[0] = Definition(res);
5817    ctx->block->instructions.emplace_back(std::move(vec));
5818    return res;
5819 }
5820
5821
5822 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
5823 {
        /* Instruction selection for the NIR image_deref_load intrinsic.
         *
         * Buffer images are read with a buffer_load_format_* instruction
         * indexed by the first coordinate; all other dimensions use
         * image_load / image_load_mip. Only the components that are actually
         * read are loaded, and expand_vector() fills in the destination.
         */
5824    Builder bld(ctx->program, ctx->block);
5825    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
5826    const struct glsl_type *type = glsl_without_array(var->type);
5827    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5828    bool is_array = glsl_sampler_type_is_array(type);
5829    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5830
5831    if (dim == GLSL_SAMPLER_DIM_BUF) {
           /* The channel count comes from util_last_bit() of the mask of read
            * components, so a contiguous prefix of channels is loaded. */
5832       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
5833       unsigned num_channels = util_last_bit(mask);
5834       Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
5835       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
5836
5837       aco_opcode opcode;
5838       switch (num_channels) {
5839       case 1:
5840          opcode = aco_opcode::buffer_load_format_x;
5841          break;
5842       case 2:
5843          opcode = aco_opcode::buffer_load_format_xy;
5844          break;
5845       case 3:
5846          opcode = aco_opcode::buffer_load_format_xyz;
5847          break;
5848       case 4:
5849          opcode = aco_opcode::buffer_load_format_xyzw;
5850          break;
5851       default:
5852          unreachable(">4 channel buffer image load");
5853       }
           /* Operands: resource, index VGPR, constant 0. */
5854       aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
5855       load->operands[0] = Operand(rsrc);
5856       load->operands[1] = Operand(vindex);
5857       load->operands[2] = Operand((uint32_t) 0);
           /* Load straight into dst when every component is loaded and dst is a
            * VGPR; otherwise go through a temporary. */
5858       Temp tmp;
5859       if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
5860          tmp = dst;
5861       else
5862          tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
5863       load->definitions[0] = Definition(tmp);
5864       load->idxen = true;
           /* glc for volatile/coherent access; dlc additionally on GFX10+. */
5865       load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
5866       load->dlc = load->glc && ctx->options->chip_class >= GFX10;
5867       load->barrier = barrier_image;
5868       ctx->block->instructions.emplace_back(std::move(load));
5869
5870       expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
5871       return;
5872    }
5873
5874    Temp coords = get_image_coords(ctx, instr, type);
5875    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
5876
        /* Here the mask of read components is used directly as dmask, so only
         * those components are loaded. */
5877    unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
5878    unsigned num_components = util_bitcount(dmask);
5879    Temp tmp;
5880    if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
5881       tmp = dst;
5882    else
5883       tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
5884
        /* src[3] is the LOD; a constant zero selects the non-mip opcode (and
         * get_image_coords() then does not append it to the coordinates). */
5885    bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0;
5886    aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip;
5887
5888    aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1)};
5889    load->operands[0] = Operand(resource);
5890    load->operands[1] = Operand(s4); /* no sampler */
5891    load->operands[2] = Operand(coords);
5892    load->definitions[0] = Definition(tmp);
5893    load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
5894    load->dlc = load->glc && ctx->options->chip_class >= GFX10;
5895    load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
5896    load->dmask = dmask;
5897    load->unrm = true;
5898    load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
5899    load->barrier = barrier_image;
5900    ctx->block->instructions.emplace_back(std::move(load));
5901
5902    expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
5903    return;
5904 }
5905
5906 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
5907 {
        /* Instruction selection for the NIR image_deref_store intrinsic.
         *
         * Buffer images are written with a buffer_store_format_* instruction
         * indexed by the first coordinate; all other dimensions use
         * image_store / image_store_mip. Both paths disable WQM on the store
         * and set program->needs_exact.
         */
5908    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
5909    const struct glsl_type *type = glsl_without_array(var->type);
5910    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5911    bool is_array = glsl_sampler_type_is_array(type);
        /* src[3] is the value to store; it is moved to VGPRs if necessary. */
5912    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
5913
        /* Parsed as (GFX6 || (access & flags)) ? 1 : 0: glc is always set on
         * GFX6, otherwise for volatile, coherent or non-readable access. */
5914    bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
5915
5916    if (dim == GLSL_SAMPLER_DIM_BUF) {
5917       Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
5918       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
           /* The opcode follows the size of the data temp (1-4). */
5919       aco_opcode opcode;
5920       switch (data.size()) {
5921       case 1:
5922          opcode = aco_opcode::buffer_store_format_x;
5923          break;
5924       case 2:
5925          opcode = aco_opcode::buffer_store_format_xy;
5926          break;
5927       case 3:
5928          opcode = aco_opcode::buffer_store_format_xyz;
5929          break;
5930       case 4:
5931          opcode = aco_opcode::buffer_store_format_xyzw;
5932          break;
5933       default:
5934          unreachable(">4 channel buffer image store");
5935       }
           /* Operands: resource, index VGPR, constant 0, data. */
5936       aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
5937       store->operands[0] = Operand(rsrc);
5938       store->operands[1] = Operand(vindex);
5939       store->operands[2] = Operand((uint32_t) 0);
5940       store->operands[3] = Operand(data);
5941       store->idxen = true;
5942       store->glc = glc;
5943       store->dlc = false;
5944       store->disable_wqm = true;
5945       store->barrier = barrier_image;
5946       ctx->program->needs_exact = true;
5947       ctx->block->instructions.emplace_back(std::move(store));
5948       return;
5949    }
5950
5951    assert(data.type() == RegType::vgpr);
5952    Temp coords = get_image_coords(ctx, instr, type);
5953    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
5954
        /* src[4] is the LOD; a constant zero selects the non-mip opcode (and
         * get_image_coords() then does not append it to the coordinates). */
5955    bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
5956    aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
5957
        /* Operands: resource, data, coordinates. */
5958    aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 0)};
5959    store->operands[0] = Operand(resource);
5960    store->operands[1] = Operand(data);
5961    store->operands[2] = Operand(coords);
5962    store->glc = glc;
5963    store->dlc = false;
5964    store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
        /* One dmask bit per dword of data. */
5965    store->dmask = (1 << data.size()) - 1;
5966    store->unrm = true;
5967    store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
5968    store->disable_wqm = true;
5969    store->barrier = barrier_image;
5970    ctx->program->needs_exact = true;
5971    ctx->block->instructions.emplace_back(std::move(store));
5972    return;
5973 }
5974
5975 void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
5976 {
5977    /* return the previous value if dest is ever used */
5978    bool return_previous = false;
5979    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
5980       return_previous = true;
5981       break;
5982    }
5983    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
5984       return_previous = true;
5985       break;
5986    }
5987
5988    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
5989    const struct glsl_type *type = glsl_without_array(var->type);
5990    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
5991    bool is_array = glsl_sampler_type_is_array(type);
5992    Builder bld(ctx->program, ctx->block);
5993
5994    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
5995    assert(data.size() == 1 && "64bit ssbo atomics not yet implemented.");
5996
5997    if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
5998       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
5999
6000    aco_opcode buf_op, image_op;
6001    switch (instr->intrinsic) {
6002       case nir_intrinsic_image_deref_atomic_add:
6003          buf_op = aco_opcode::buffer_atomic_add;
6004          image_op = aco_opcode::image_atomic_add;
6005          break;
6006       case nir_intrinsic_image_deref_atomic_umin:
6007          buf_op = aco_opcode::buffer_atomic_umin;
6008          image_op = aco_opcode::image_atomic_umin;
6009          break;
6010       case nir_intrinsic_image_deref_atomic_imin:
6011          buf_op = aco_opcode::buffer_atomic_smin;
6012          image_op = aco_opcode::image_atomic_smin;
6013          break;
6014       case nir_intrinsic_image_deref_atomic_umax:
6015          buf_op = aco_opcode::buffer_atomic_umax;
6016          image_op = aco_opcode::image_atomic_umax;
6017          break;
6018       case nir_intrinsic_image_deref_atomic_imax:
6019          buf_op = aco_opcode::buffer_atomic_smax;
6020          image_op = aco_opcode::image_atomic_smax;
6021          break;
6022       case nir_intrinsic_image_deref_atomic_and:
6023          buf_op = aco_opcode::buffer_atomic_and;
6024          image_op = aco_opcode::image_atomic_and;
6025          break;
6026       case nir_intrinsic_image_deref_atomic_or:
6027          buf_op = aco_opcode::buffer_atomic_or;
6028          image_op = aco_opcode::image_atomic_or;
6029          break;
6030       case nir_intrinsic_image_deref_atomic_xor:
6031          buf_op = aco_opcode::buffer_atomic_xor;
6032          image_op = aco_opcode::image_atomic_xor;
6033          break;
6034       case nir_intrinsic_image_deref_atomic_exchange:
6035          buf_op = aco_opcode::buffer_atomic_swap;
6036          image_op = aco_opcode::image_atomic_swap;
6037          break;
6038       case nir_intrinsic_image_deref_atomic_comp_swap:
6039          buf_op = aco_opcode::buffer_atomic_cmpswap;
6040          image_op = aco_opcode::image_atomic_cmpswap;
6041          break;
6042       default:
6043          unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
6044    }
6045
6046    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6047
6048    if (dim == GLSL_SAMPLER_DIM_BUF) {
6049       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
6050       Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
6051       //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
6052       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6053       mubuf->operands[0] = Operand(resource);
6054       mubuf->operands[1] = Operand(vindex);
6055       mubuf->operands[2] = Operand((uint32_t)0);
6056       mubuf->operands[3] = Operand(data);
6057       if (return_previous)
6058          mubuf->definitions[0] = Definition(dst);
6059       mubuf->offset = 0;
6060       mubuf->idxen = true;
6061       mubuf->glc = return_previous;
6062       mubuf->dlc = false; /* Not needed for atomics */
6063       mubuf->disable_wqm = true;
6064       mubuf->barrier = barrier_image;
6065       ctx->program->needs_exact = true;
6066       ctx->block->instructions.emplace_back(std::move(mubuf));
6067       return;
6068    }
6069
6070    Temp coords = get_image_coords(ctx, instr, type);
6071    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
6072    aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 3, return_previous ? 1 : 0)};
6073    mimg->operands[0] = Operand(resource);
6074    mimg->operands[1] = Operand(data);
6075    mimg->operands[2] = Operand(coords);
6076    if (return_previous)
6077       mimg->definitions[0] = Definition(dst);
6078    mimg->glc = return_previous;
6079    mimg->dlc = false; /* Not needed for atomics */
6080    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6081    mimg->dmask = (1 << data.size()) - 1;
6082    mimg->unrm = true;
6083    mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
6084    mimg->disable_wqm = true;
6085    mimg->barrier = barrier_image;
6086    ctx->program->needs_exact = true;
6087    ctx->block->instructions.emplace_back(std::move(mimg));
6088    return;
6089 }
6090
6091 void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
6092 {
6093    if (in_elements && ctx->options->chip_class == GFX8) {
6094       /* we only have to divide by 1, 2, 4, 8, 12 or 16 */
6095       Builder bld(ctx->program, ctx->block);
6096
6097       Temp size = emit_extract_vector(ctx, desc, 2, s1);
6098
6099       Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), bld.copy(bld.def(v1), Operand(0xaaaaaaabu)), size);
6100       size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.as_uniform(size_div3), Operand(1u));
6101
6102       Temp stride = emit_extract_vector(ctx, desc, 1, s1);
6103       stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
6104
6105       Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand(12u));
6106       size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12));
6107
6108       Temp shr_dst = dst.type() == RegType::vgpr ? bld.tmp(s1) : dst;
6109       bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc),
6110                size, bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride));
6111       if (dst.type() == RegType::vgpr)
6112          bld.copy(Definition(dst), shr_dst);
6113
6114       /* TODO: we can probably calculate this faster with v_skip when stride != 12 */
6115    } else {
6116       emit_extract_vector(ctx, desc, 2, dst);
6117    }
6118 }
6119
6120 void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
6121 {
6122    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
6123    const struct glsl_type *type = glsl_without_array(var->type);
6124    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
6125    bool is_array = glsl_sampler_type_is_array(type);
6126    Builder bld(ctx->program, ctx->block);
6127
6128    if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
6129       Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
6130       return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
6131    }
6132
6133    /* LOD */
6134    Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6135
6136    /* Resource */
6137    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
6138
6139    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6140
6141    aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1)};
6142    mimg->operands[0] = Operand(resource);
6143    mimg->operands[1] = Operand(s4); /* no sampler */
6144    mimg->operands[2] = Operand(lod);
6145    uint8_t& dmask = mimg->dmask;
6146    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
6147    mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
6148    mimg->da = glsl_sampler_type_is_array(type);
6149    mimg->can_reorder = true;
6150    Definition& def = mimg->definitions[0];
6151    ctx->block->instructions.emplace_back(std::move(mimg));
6152
6153    if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
6154        glsl_sampler_type_is_array(type)) {
6155
6156       assert(instr->dest.ssa.num_components == 3);
6157       Temp tmp = {ctx->program->allocateId(), v3};
6158       def = Definition(tmp);
6159       emit_split_vector(ctx, tmp, 3);
6160
6161       /* divide 3rd value by 6 by multiplying with magic number */
6162       Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
6163       Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
6164
6165       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
6166                  emit_extract_vector(ctx, tmp, 0, v1),
6167                  emit_extract_vector(ctx, tmp, 1, v1),
6168                  by_6);
6169
6170    } else if (ctx->options->chip_class == GFX9 &&
6171               glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
6172               glsl_sampler_type_is_array(type)) {
6173       assert(instr->dest.ssa.num_components == 2);
6174       def = Definition(dst);
6175       dmask = 0x5;
6176    } else {
6177       def = Definition(dst);
6178    }
6179
6180    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
6181 }
6182
6183 void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
6184 {
6185    Builder bld(ctx->program, ctx->block);
6186    unsigned num_components = instr->num_components;
6187
6188    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6189    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6190    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
6191
6192    unsigned access = nir_intrinsic_access(instr);
6193    bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
6194    unsigned size = instr->dest.ssa.bit_size / 8;
6195
6196    uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[0].ssa, access);
6197    /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
6198     * TODO: this optimization is disabled for now because we still need to ensure correct ordering
6199     */
6200    bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_store : has_vmem_store));
6201    allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
6202
6203    load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
6204                nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
6205 }
6206
6207 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
6208 {
6209    Builder bld(ctx->program, ctx->block);
6210    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6211    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6212    unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6213    Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
6214
6215    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6216    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
6217
6218    bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6219    uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
6220    /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
6221     * TODO: this optimization is disabled for now because we still need to ensure correct ordering
6222     */
6223    bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_loadstore : has_vmem_loadstore));
6224
6225    bool smem = !nir_src_is_divergent(instr->src[2]) &&
6226                ctx->options->chip_class >= GFX8 &&
6227                (elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) &&
6228                allow_smem;
6229    if (smem)
6230       offset = bld.as_uniform(offset);
6231    bool smem_nonfs = smem && ctx->stage != fragment_fs;
6232
6233    unsigned write_count = 0;
6234    Temp write_datas[32];
6235    unsigned offsets[32];
6236    split_buffer_store(ctx, instr, smem, smem_nonfs ? RegType::sgpr : (smem ? data.type() : RegType::vgpr),
6237                       data, writemask, 16, &write_count, write_datas, offsets);
6238
6239    for (unsigned i = 0; i < write_count; i++) {
6240       aco_opcode op = get_buffer_store_op(smem, write_datas[i].bytes());
6241       if (smem && ctx->stage == fragment_fs)
6242          op = aco_opcode::p_fs_buffer_store_smem;
6243
6244       if (smem) {
6245          aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
6246          store->operands[0] = Operand(rsrc);
6247          if (offsets[i]) {
6248             Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
6249                                 offset, Operand(offsets[i]));
6250             store->operands[1] = Operand(off);
6251          } else {
6252             store->operands[1] = Operand(offset);
6253          }
6254          if (op != aco_opcode::p_fs_buffer_store_smem)
6255             store->operands[1].setFixed(m0);
6256          store->operands[2] = Operand(write_datas[i]);
6257          store->glc = glc;
6258          store->dlc = false;
6259          store->disable_wqm = true;
6260          store->barrier = barrier_buffer;
6261          ctx->block->instructions.emplace_back(std::move(store));
6262          ctx->program->wb_smem_l1_on_end = true;
6263          if (op == aco_opcode::p_fs_buffer_store_smem) {
6264             ctx->block->kind |= block_kind_needs_lowering;
6265             ctx->program->needs_exact = true;
6266          }
6267       } else {
6268          aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6269          store->operands[0] = Operand(rsrc);
6270          store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6271          store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
6272          store->operands[3] = Operand(write_datas[i]);
6273          store->offset = offsets[i];
6274          store->offen = (offset.type() == RegType::vgpr);
6275          store->glc = glc;
6276          store->dlc = false;
6277          store->disable_wqm = true;
6278          store->barrier = barrier_buffer;
6279          ctx->program->needs_exact = true;
6280          ctx->block->instructions.emplace_back(std::move(store));
6281       }
6282    }
6283 }
6284
6285 void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
6286 {
6287    /* return the previous value if dest is ever used */
6288    bool return_previous = false;
6289    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
6290       return_previous = true;
6291       break;
6292    }
6293    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
6294       return_previous = true;
6295       break;
6296    }
6297
6298    Builder bld(ctx->program, ctx->block);
6299    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
6300
6301    if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
6302       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6303                         get_ssa_temp(ctx, instr->src[3].ssa), data);
6304
6305    Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
6306    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6307    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
6308
6309    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6310
6311    aco_opcode op32, op64;
6312    switch (instr->intrinsic) {
6313       case nir_intrinsic_ssbo_atomic_add:
6314          op32 = aco_opcode::buffer_atomic_add;
6315          op64 = aco_opcode::buffer_atomic_add_x2;
6316          break;
6317       case nir_intrinsic_ssbo_atomic_imin:
6318          op32 = aco_opcode::buffer_atomic_smin;
6319          op64 = aco_opcode::buffer_atomic_smin_x2;
6320          break;
6321       case nir_intrinsic_ssbo_atomic_umin:
6322          op32 = aco_opcode::buffer_atomic_umin;
6323          op64 = aco_opcode::buffer_atomic_umin_x2;
6324          break;
6325       case nir_intrinsic_ssbo_atomic_imax:
6326          op32 = aco_opcode::buffer_atomic_smax;
6327          op64 = aco_opcode::buffer_atomic_smax_x2;
6328          break;
6329       case nir_intrinsic_ssbo_atomic_umax:
6330          op32 = aco_opcode::buffer_atomic_umax;
6331          op64 = aco_opcode::buffer_atomic_umax_x2;
6332          break;
6333       case nir_intrinsic_ssbo_atomic_and:
6334          op32 = aco_opcode::buffer_atomic_and;
6335          op64 = aco_opcode::buffer_atomic_and_x2;
6336          break;
6337       case nir_intrinsic_ssbo_atomic_or:
6338          op32 = aco_opcode::buffer_atomic_or;
6339          op64 = aco_opcode::buffer_atomic_or_x2;
6340          break;
6341       case nir_intrinsic_ssbo_atomic_xor:
6342          op32 = aco_opcode::buffer_atomic_xor;
6343          op64 = aco_opcode::buffer_atomic_xor_x2;
6344          break;
6345       case nir_intrinsic_ssbo_atomic_exchange:
6346          op32 = aco_opcode::buffer_atomic_swap;
6347          op64 = aco_opcode::buffer_atomic_swap_x2;
6348          break;
6349       case nir_intrinsic_ssbo_atomic_comp_swap:
6350          op32 = aco_opcode::buffer_atomic_cmpswap;
6351          op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6352          break;
6353       default:
6354          unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
6355    }
6356    aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6357    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6358    mubuf->operands[0] = Operand(rsrc);
6359    mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
6360    mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
6361    mubuf->operands[3] = Operand(data);
6362    if (return_previous)
6363       mubuf->definitions[0] = Definition(dst);
6364    mubuf->offset = 0;
6365    mubuf->offen = (offset.type() == RegType::vgpr);
6366    mubuf->glc = return_previous;
6367    mubuf->dlc = false; /* Not needed for atomics */
6368    mubuf->disable_wqm = true;
6369    mubuf->barrier = barrier_buffer;
6370    ctx->program->needs_exact = true;
6371    ctx->block->instructions.emplace_back(std::move(mubuf));
6372 }
6373
6374 void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
6375
6376    Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6377    Builder bld(ctx->program, ctx->block);
6378    Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
6379    get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
6380 }
6381
6382 void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
6383 {
6384    Builder bld(ctx->program, ctx->block);
6385    unsigned num_components = instr->num_components;
6386    unsigned component_size = instr->dest.ssa.bit_size / 8;
6387
6388    LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)),
6389                         get_ssa_temp(ctx, &instr->dest.ssa),
6390                         num_components, component_size};
6391    info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
6392    info.align_mul = nir_intrinsic_align_mul(instr);
6393    info.align_offset = nir_intrinsic_align_offset(instr);
6394    info.barrier = barrier_buffer;
6395    info.can_reorder = false;
6396    /* VMEM stores don't update the SMEM cache and it's difficult to prove that
6397     * it's safe to use SMEM */
6398    bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
6399    if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) || !can_use_smem) {
6400       emit_global_load(ctx, bld, &info);
6401    } else {
6402       info.offset = Operand(bld.as_uniform(info.offset));
6403       emit_smem_load(ctx, bld, &info);
6404    }
6405 }
6406
6407 void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
6408 {
6409    Builder bld(ctx->program, ctx->block);
6410    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6411    unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6412
6413    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6414    Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
6415    bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
6416
6417    if (ctx->options->chip_class >= GFX7)
6418       addr = as_vgpr(ctx, addr);
6419
6420    unsigned write_count = 0;
6421    Temp write_datas[32];
6422    unsigned offsets[32];
6423    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
6424                       16, &write_count, write_datas, offsets);
6425
6426    for (unsigned i = 0; i < write_count; i++) {
6427       if (ctx->options->chip_class >= GFX7) {
6428          unsigned offset = offsets[i];
6429          Temp store_addr = addr;
6430          if (offset > 0 && ctx->options->chip_class < GFX9) {
6431             Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
6432             Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
6433             Temp carry = bld.tmp(bld.lm);
6434             bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
6435
6436             bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
6437                      Operand(offset), addr0);
6438             bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm),
6439                      Operand(0u), addr1,
6440                      carry).def(1).setHint(vcc);
6441
6442             store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
6443
6444             offset = 0;
6445          }
6446
6447          bool global = ctx->options->chip_class >= GFX9;
6448          aco_opcode op;
6449          switch (write_datas[i].bytes()) {
6450          case 1:
6451             op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte;
6452             break;
6453          case 2:
6454             op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short;
6455             break;
6456          case 4:
6457             op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
6458             break;
6459          case 8:
6460             op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
6461             break;
6462          case 12:
6463             op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
6464             break;
6465          case 16:
6466             op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
6467             break;
6468          default:
6469             unreachable("store_global not implemented for this size.");
6470          }
6471
6472          aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
6473          flat->operands[0] = Operand(store_addr);
6474          flat->operands[1] = Operand(s1);
6475          flat->operands[2] = Operand(write_datas[i]);
6476          flat->glc = glc;
6477          flat->dlc = false;
6478          flat->offset = offset;
6479          flat->disable_wqm = true;
6480          flat->barrier = barrier_buffer;
6481          ctx->program->needs_exact = true;
6482          ctx->block->instructions.emplace_back(std::move(flat));
6483       } else {
6484          assert(ctx->options->chip_class == GFX6);
6485
6486          aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
6487
6488          Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6489
6490          aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
6491          mubuf->operands[0] = Operand(rsrc);
6492          mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6493          mubuf->operands[2] = Operand(0u);
6494          mubuf->operands[3] = Operand(write_datas[i]);
6495          mubuf->glc = glc;
6496          mubuf->dlc = false;
6497          mubuf->offset = offsets[i];
6498          mubuf->addr64 = addr.type() == RegType::vgpr;
6499          mubuf->disable_wqm = true;
6500          mubuf->barrier = barrier_buffer;
6501          ctx->program->needs_exact = true;
6502          ctx->block->instructions.emplace_back(std::move(mubuf));
6503       }
6504    }
6505 }
6506
6507 void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
6508 {
6509    /* return the previous value if dest is ever used */
6510    bool return_previous = false;
6511    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
6512       return_previous = true;
6513       break;
6514    }
6515    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
6516       return_previous = true;
6517       break;
6518    }
6519
6520    Builder bld(ctx->program, ctx->block);
6521    Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
6522    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6523
6524    if (ctx->options->chip_class >= GFX7)
6525       addr = as_vgpr(ctx, addr);
6526
6527    if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap)
6528       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
6529                         get_ssa_temp(ctx, instr->src[2].ssa), data);
6530
6531    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6532
6533    aco_opcode op32, op64;
6534
6535    if (ctx->options->chip_class >= GFX7) {
6536       bool global = ctx->options->chip_class >= GFX9;
6537       switch (instr->intrinsic) {
6538          case nir_intrinsic_global_atomic_add:
6539             op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add;
6540             op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2;
6541             break;
6542          case nir_intrinsic_global_atomic_imin:
6543             op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin;
6544             op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2;
6545             break;
6546          case nir_intrinsic_global_atomic_umin:
6547             op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin;
6548             op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2;
6549             break;
6550          case nir_intrinsic_global_atomic_imax:
6551             op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax;
6552             op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2;
6553             break;
6554          case nir_intrinsic_global_atomic_umax:
6555             op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax;
6556             op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2;
6557             break;
6558          case nir_intrinsic_global_atomic_and:
6559             op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and;
6560             op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2;
6561             break;
6562          case nir_intrinsic_global_atomic_or:
6563             op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or;
6564             op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2;
6565             break;
6566          case nir_intrinsic_global_atomic_xor:
6567             op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor;
6568             op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2;
6569             break;
6570          case nir_intrinsic_global_atomic_exchange:
6571             op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap;
6572             op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2;
6573             break;
6574          case nir_intrinsic_global_atomic_comp_swap:
6575             op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap;
6576             op64 = global ? aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2;
6577             break;
6578          default:
6579             unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
6580       }
6581
6582       aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6583       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)};
6584       flat->operands[0] = Operand(addr);
6585       flat->operands[1] = Operand(s1);
6586       flat->operands[2] = Operand(data);
6587       if (return_previous)
6588          flat->definitions[0] = Definition(dst);
6589       flat->glc = return_previous;
6590       flat->dlc = false; /* Not needed for atomics */
6591       flat->offset = 0;
6592       flat->disable_wqm = true;
6593       flat->barrier = barrier_buffer;
6594       ctx->program->needs_exact = true;
6595       ctx->block->instructions.emplace_back(std::move(flat));
6596    } else {
6597       assert(ctx->options->chip_class == GFX6);
6598
6599       switch (instr->intrinsic) {
6600          case nir_intrinsic_global_atomic_add:
6601             op32 = aco_opcode::buffer_atomic_add;
6602             op64 = aco_opcode::buffer_atomic_add_x2;
6603             break;
6604          case nir_intrinsic_global_atomic_imin:
6605             op32 = aco_opcode::buffer_atomic_smin;
6606             op64 = aco_opcode::buffer_atomic_smin_x2;
6607             break;
6608          case nir_intrinsic_global_atomic_umin:
6609             op32 = aco_opcode::buffer_atomic_umin;
6610             op64 = aco_opcode::buffer_atomic_umin_x2;
6611             break;
6612          case nir_intrinsic_global_atomic_imax:
6613             op32 = aco_opcode::buffer_atomic_smax;
6614             op64 = aco_opcode::buffer_atomic_smax_x2;
6615             break;
6616          case nir_intrinsic_global_atomic_umax:
6617             op32 = aco_opcode::buffer_atomic_umax;
6618             op64 = aco_opcode::buffer_atomic_umax_x2;
6619             break;
6620          case nir_intrinsic_global_atomic_and:
6621             op32 = aco_opcode::buffer_atomic_and;
6622             op64 = aco_opcode::buffer_atomic_and_x2;
6623             break;
6624          case nir_intrinsic_global_atomic_or:
6625             op32 = aco_opcode::buffer_atomic_or;
6626             op64 = aco_opcode::buffer_atomic_or_x2;
6627             break;
6628          case nir_intrinsic_global_atomic_xor:
6629             op32 = aco_opcode::buffer_atomic_xor;
6630             op64 = aco_opcode::buffer_atomic_xor_x2;
6631             break;
6632          case nir_intrinsic_global_atomic_exchange:
6633             op32 = aco_opcode::buffer_atomic_swap;
6634             op64 = aco_opcode::buffer_atomic_swap_x2;
6635             break;
6636          case nir_intrinsic_global_atomic_comp_swap:
6637             op32 = aco_opcode::buffer_atomic_cmpswap;
6638             op64 = aco_opcode::buffer_atomic_cmpswap_x2;
6639             break;
6640          default:
6641             unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* instructions.");
6642       }
6643
6644       Temp rsrc = get_gfx6_global_rsrc(bld, addr);
6645
6646       aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
6647
6648       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
6649       mubuf->operands[0] = Operand(rsrc);
6650       mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
6651       mubuf->operands[2] = Operand(0u);
6652       mubuf->operands[3] = Operand(data);
6653       if (return_previous)
6654          mubuf->definitions[0] = Definition(dst);
6655       mubuf->glc = return_previous;
6656       mubuf->dlc = false;
6657       mubuf->offset = 0;
6658       mubuf->addr64 = addr.type() == RegType::vgpr;
6659       mubuf->disable_wqm = true;
6660       mubuf->barrier = barrier_buffer;
6661       ctx->program->needs_exact = true;
6662       ctx->block->instructions.emplace_back(std::move(mubuf));
6663    }
6664 }
6665
6666 void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
6667    Builder bld(ctx->program, ctx->block);
6668    switch(instr->intrinsic) {
6669       case nir_intrinsic_group_memory_barrier:
6670       case nir_intrinsic_memory_barrier:
6671          bld.barrier(aco_opcode::p_memory_barrier_common);
6672          break;
6673       case nir_intrinsic_memory_barrier_buffer:
6674          bld.barrier(aco_opcode::p_memory_barrier_buffer);
6675          break;
6676       case nir_intrinsic_memory_barrier_image:
6677          bld.barrier(aco_opcode::p_memory_barrier_image);
6678          break;
6679       case nir_intrinsic_memory_barrier_tcs_patch:
6680       case nir_intrinsic_memory_barrier_shared:
6681          bld.barrier(aco_opcode::p_memory_barrier_shared);
6682          break;
6683       default:
6684          unreachable("Unimplemented memory barrier intrinsic");
6685          break;
6686    }
6687 }
6688
6689 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
6690 {
6691    // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
6692    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6693    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6694    Builder bld(ctx->program, ctx->block);
6695
6696    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
6697    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
6698    load_lds(ctx, elem_size_bytes, dst, address, nir_intrinsic_base(instr), align);
6699 }
6700
6701 void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
6702 {
6703    unsigned writemask = nir_intrinsic_write_mask(instr);
6704    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
6705    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6706    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6707
6708    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
6709    store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
6710 }
6711
6712 void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
6713 {
6714    unsigned offset = nir_intrinsic_base(instr);
6715    Builder bld(ctx->program, ctx->block);
6716    Operand m = load_lds_size_m0(bld);
6717    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6718    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6719
6720    unsigned num_operands = 3;
6721    aco_opcode op32, op64, op32_rtn, op64_rtn;
6722    switch(instr->intrinsic) {
6723       case nir_intrinsic_shared_atomic_add:
6724          op32 = aco_opcode::ds_add_u32;
6725          op64 = aco_opcode::ds_add_u64;
6726          op32_rtn = aco_opcode::ds_add_rtn_u32;
6727          op64_rtn = aco_opcode::ds_add_rtn_u64;
6728          break;
6729       case nir_intrinsic_shared_atomic_imin:
6730          op32 = aco_opcode::ds_min_i32;
6731          op64 = aco_opcode::ds_min_i64;
6732          op32_rtn = aco_opcode::ds_min_rtn_i32;
6733          op64_rtn = aco_opcode::ds_min_rtn_i64;
6734          break;
6735       case nir_intrinsic_shared_atomic_umin:
6736          op32 = aco_opcode::ds_min_u32;
6737          op64 = aco_opcode::ds_min_u64;
6738          op32_rtn = aco_opcode::ds_min_rtn_u32;
6739          op64_rtn = aco_opcode::ds_min_rtn_u64;
6740          break;
6741       case nir_intrinsic_shared_atomic_imax:
6742          op32 = aco_opcode::ds_max_i32;
6743          op64 = aco_opcode::ds_max_i64;
6744          op32_rtn = aco_opcode::ds_max_rtn_i32;
6745          op64_rtn = aco_opcode::ds_max_rtn_i64;
6746          break;
6747       case nir_intrinsic_shared_atomic_umax:
6748          op32 = aco_opcode::ds_max_u32;
6749          op64 = aco_opcode::ds_max_u64;
6750          op32_rtn = aco_opcode::ds_max_rtn_u32;
6751          op64_rtn = aco_opcode::ds_max_rtn_u64;
6752          break;
6753       case nir_intrinsic_shared_atomic_and:
6754          op32 = aco_opcode::ds_and_b32;
6755          op64 = aco_opcode::ds_and_b64;
6756          op32_rtn = aco_opcode::ds_and_rtn_b32;
6757          op64_rtn = aco_opcode::ds_and_rtn_b64;
6758          break;
6759       case nir_intrinsic_shared_atomic_or:
6760          op32 = aco_opcode::ds_or_b32;
6761          op64 = aco_opcode::ds_or_b64;
6762          op32_rtn = aco_opcode::ds_or_rtn_b32;
6763          op64_rtn = aco_opcode::ds_or_rtn_b64;
6764          break;
6765       case nir_intrinsic_shared_atomic_xor:
6766          op32 = aco_opcode::ds_xor_b32;
6767          op64 = aco_opcode::ds_xor_b64;
6768          op32_rtn = aco_opcode::ds_xor_rtn_b32;
6769          op64_rtn = aco_opcode::ds_xor_rtn_b64;
6770          break;
6771       case nir_intrinsic_shared_atomic_exchange:
6772          op32 = aco_opcode::ds_write_b32;
6773          op64 = aco_opcode::ds_write_b64;
6774          op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
6775          op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
6776          break;
6777       case nir_intrinsic_shared_atomic_comp_swap:
6778          op32 = aco_opcode::ds_cmpst_b32;
6779          op64 = aco_opcode::ds_cmpst_b64;
6780          op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
6781          op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
6782          num_operands = 4;
6783          break;
6784       default:
6785          unreachable("Unhandled shared atomic intrinsic");
6786    }
6787
6788    /* return the previous value if dest is ever used */
6789    bool return_previous = false;
6790    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
6791       return_previous = true;
6792       break;
6793    }
6794    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
6795       return_previous = true;
6796       break;
6797    }
6798
6799    aco_opcode op;
6800    if (data.size() == 1) {
6801       assert(instr->dest.ssa.bit_size == 32);
6802       op = return_previous ? op32_rtn : op32;
6803    } else {
6804       assert(instr->dest.ssa.bit_size == 64);
6805       op = return_previous ? op64_rtn : op64;
6806    }
6807
6808    if (offset > 65535) {
6809       address = bld.vadd32(bld.def(v1), Operand(offset), address);
6810       offset = 0;
6811    }
6812
6813    aco_ptr<DS_instruction> ds;
6814    ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
6815    ds->operands[0] = Operand(address);
6816    ds->operands[1] = Operand(data);
6817    if (num_operands == 4)
6818       ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
6819    ds->operands[num_operands - 1] = m;
6820    ds->offset0 = offset;
6821    if (return_previous)
6822       ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
6823    ctx->block->instructions.emplace_back(std::move(ds));
6824 }
6825
6826 Temp get_scratch_resource(isel_context *ctx)
6827 {
6828    Builder bld(ctx->program, ctx->block);
6829    Temp scratch_addr = ctx->program->private_segment_buffer;
6830    if (ctx->stage != compute_cs)
6831       scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
6832
6833    uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
6834                         S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);;
6835
6836    if (ctx->program->chip_class >= GFX10) {
6837       rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
6838                    S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) |
6839                    S_008F0C_RESOURCE_LEVEL(1);
6840    } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
6841       rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
6842                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
6843    }
6844
6845    /* older generations need element size = 16 bytes. element size removed in GFX9 */
6846    if (ctx->program->chip_class <= GFX8)
6847       rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
6848
6849    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
6850 }
6851
6852 void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
6853    Builder bld(ctx->program, ctx->block);
6854    Temp rsrc = get_scratch_resource(ctx);
6855    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6856    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6857
6858    LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components,
6859                         instr->dest.ssa.bit_size / 8u, rsrc};
6860    info.align_mul = nir_intrinsic_align_mul(instr);
6861    info.align_offset = nir_intrinsic_align_offset(instr);
6862    info.swizzle_component_size = 16;
6863    info.can_reorder = false;
6864    info.soffset = ctx->program->scratch_offset;
6865    emit_mubuf_load(ctx, bld, &info);
6866 }
6867
6868 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
6869    Builder bld(ctx->program, ctx->block);
6870    Temp rsrc = get_scratch_resource(ctx);
6871    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6872    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
6873
6874    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
6875    unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
6876
6877    unsigned write_count = 0;
6878    Temp write_datas[32];
6879    unsigned offsets[32];
6880    split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
6881                       16, &write_count, write_datas, offsets);
6882
6883    for (unsigned i = 0; i < write_count; i++) {
6884       aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
6885       bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true);
6886    }
6887 }
6888
6889 void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
6890    uint8_t log2_ps_iter_samples;
6891    if (ctx->program->info->ps.force_persample) {
6892       log2_ps_iter_samples =
6893          util_logbase2(ctx->options->key.fs.num_samples);
6894    } else {
6895       log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
6896    }
6897
6898    /* The bit pattern matches that used by fixed function fragment
6899     * processing. */
6900    static const unsigned ps_iter_masks[] = {
6901       0xffff, /* not used */
6902       0x5555,
6903       0x1111,
6904       0x0101,
6905       0x0001,
6906    };
6907    assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
6908
6909    Builder bld(ctx->program, ctx->block);
6910
6911    Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
6912                              get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
6913    Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
6914    Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
6915    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6916    bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, get_arg(ctx, ctx->args->ac.sample_coverage));
6917 }
6918
6919 void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *instr) {
6920    Builder bld(ctx->program, ctx->block);
6921
6922    unsigned stream = nir_intrinsic_stream_id(instr);
6923    Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
6924    next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u);
6925    nir_const_value *next_vertex_cv = nir_src_as_const_value(instr->src[0]);
6926
6927    /* get GSVS ring */
6928    Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_GSVS_GS * 16u));
6929
6930    unsigned num_components =
6931       ctx->program->info->gs.num_stream_output_components[stream];
6932    assert(num_components);
6933
6934    unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out;
6935    unsigned stream_offset = 0;
6936    for (unsigned i = 0; i < stream; i++) {
6937       unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * ctx->shader->info.gs.vertices_out;
6938       stream_offset += prev_stride * ctx->program->wave_size;
6939    }
6940
6941    /* Limit on the stride field for <= GFX7. */
6942    assert(stride < (1 << 14));
6943
6944    Temp gsvs_dwords[4];
6945    for (unsigned i = 0; i < 4; i++)
6946       gsvs_dwords[i] = bld.tmp(s1);
6947    bld.pseudo(aco_opcode::p_split_vector,
6948               Definition(gsvs_dwords[0]),
6949               Definition(gsvs_dwords[1]),
6950               Definition(gsvs_dwords[2]),
6951               Definition(gsvs_dwords[3]),
6952               gsvs_ring);
6953
6954    if (stream_offset) {
6955       Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand(stream_offset));
6956
6957       Temp carry = bld.tmp(s1);
6958       gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), gsvs_dwords[0], stream_offset_tmp);
6959       gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(0u), bld.scc(carry));
6960    }
6961
6962    gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], Operand(S_008F04_STRIDE(stride)));
6963    gsvs_dwords[2] = bld.copy(bld.def(s1), Operand((uint32_t)ctx->program->wave_size));
6964
6965    gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
6966                           gsvs_dwords[0], gsvs_dwords[1], gsvs_dwords[2], gsvs_dwords[3]);
6967
6968    unsigned offset = 0;
6969    for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) {
6970       if (ctx->program->info->gs.output_streams[i] != stream)
6971          continue;
6972
6973       for (unsigned j = 0; j < 4; j++) {
6974          if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j)))
6975             continue;
6976
6977          if (ctx->outputs.mask[i] & (1 << j)) {
6978             Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex);
6979             unsigned const_offset = (offset + (next_vertex_cv ? next_vertex_cv->u32 : 0u)) * 4u;
6980             if (const_offset >= 4096u) {
6981                if (vaddr_offset.isUndefined())
6982                   vaddr_offset = bld.copy(bld.def(v1), Operand(const_offset / 4096u * 4096u));
6983                else
6984                   vaddr_offset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), vaddr_offset);
6985                const_offset %= 4096u;
6986             }
6987
6988             aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)};
6989             mtbuf->operands[0] = Operand(gsvs_ring);
6990             mtbuf->operands[1] = vaddr_offset;
6991             mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->gs2vs_offset));
6992             mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]);
6993             mtbuf->offen = !vaddr_offset.isUndefined();
6994             mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32;
6995             mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT;
6996             mtbuf->offset = const_offset;
6997             mtbuf->glc = true;
6998             mtbuf->slc = true;
6999             mtbuf->barrier = barrier_gs_data;
7000             mtbuf->can_reorder = true;
7001             bld.insert(std::move(mtbuf));
7002          }
7003
7004          offset += ctx->shader->info.gs.vertices_out;
7005       }
7006
7007       /* outputs for the next vertex are undefined and keeping them around can
7008        * create invalid IR with control flow */
7009       ctx->outputs.mask[i] = 0;
7010    }
7011
7012    bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream));
7013 }
7014
7015 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
7016 {
7017    Builder bld(ctx->program, ctx->block);
7018
7019    if (cluster_size == 1) {
7020       return src;
7021    } if (op == nir_op_iand && cluster_size == 4) {
7022       //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
7023       Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7024       return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc),
7025                       bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp));
7026    } else if (op == nir_op_ior && cluster_size == 4) {
7027       //subgroupClusteredOr(val, 4) -> wqm(val & exec)
7028       return bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc),
7029                       bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)));
7030    } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) {
7031       //subgroupAnd(val) -> (exec & ~val) == 0
7032       Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
7033       Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
7034       return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond);
7035    } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) {
7036       //subgroupOr(val) -> (val & exec) != 0
7037       Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)).def(1).getTemp();
7038       return bool_to_vector_condition(ctx, tmp);
7039    } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) {
7040       //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
7041       Temp tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7042       tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp);
7043       tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
7044       return bool_to_vector_condition(ctx, tmp);
7045    } else {
7046       //subgroupClustered{And,Or,Xor}(val, n) ->
7047       //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) ;  just v_mbcnt_lo_u32_b32 on wave32
7048       //cluster_offset = ~(n - 1) & lane_id
7049       //cluster_mask = ((1 << n) - 1)
7050       //subgroupClusteredAnd():
7051       //   return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
7052       //subgroupClusteredOr():
7053       //   return ((val & exec) >> cluster_offset) & cluster_mask != 0
7054       //subgroupClusteredXor():
7055       //   return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
7056       Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
7057       Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
7058
7059       Temp tmp;
7060       if (op == nir_op_iand)
7061          tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7062       else
7063          tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7064
7065       uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
7066
7067       if (ctx->program->chip_class <= GFX7)
7068          tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
7069       else if (ctx->program->wave_size == 64)
7070          tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
7071       else
7072          tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
7073       tmp = emit_extract_vector(ctx, tmp, 0, v1);
7074       if (cluster_mask != 0xffffffff)
7075          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
7076
7077       Definition cmp_def = Definition();
7078       if (op == nir_op_iand) {
7079          cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(cluster_mask), tmp).def(0);
7080       } else if (op == nir_op_ior) {
7081          cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
7082       } else if (op == nir_op_ixor) {
7083          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
7084                         bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
7085          cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp).def(0);
7086       }
7087       cmp_def.setHint(vcc);
7088       return cmp_def.getTemp();
7089    }
7090 }
7091
7092 Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
7093 {
7094    Builder bld(ctx->program, ctx->block);
7095
7096    //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
7097    //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
7098    //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
7099    Temp tmp;
7100    if (op == nir_op_iand)
7101       tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7102    else
7103       tmp = bld.sop2(Builder::s_and, bld.def(s2), bld.def(s1, scc), src, Operand(exec, bld.lm));
7104
7105    Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
7106    Temp lo = lohi.def(0).getTemp();
7107    Temp hi = lohi.def(1).getTemp();
7108    Temp mbcnt = emit_mbcnt(ctx, bld.def(v1), Operand(lo), Operand(hi));
7109
7110    Definition cmp_def = Definition();
7111    if (op == nir_op_iand)
7112       cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
7113    else if (op == nir_op_ior)
7114       cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), mbcnt).def(0);
7115    else if (op == nir_op_ixor)
7116       cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u),
7117                          bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
7118    cmp_def.setHint(vcc);
7119    return cmp_def.getTemp();
7120 }
7121
7122 Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
7123 {
7124    Builder bld(ctx->program, ctx->block);
7125
7126    //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
7127    //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
7128    //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
7129    Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
7130    if (op == nir_op_iand)
7131       return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7132    else if (op == nir_op_ior)
7133       return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7134    else if (op == nir_op_ixor)
7135       return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src);
7136
7137    assert(false);
7138    return Temp();
7139 }
7140
7141 void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
7142 {
7143    Builder bld(ctx->program, ctx->block);
7144    Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
7145    if (src.regClass().type() == RegType::vgpr) {
7146       bld.pseudo(aco_opcode::p_as_uniform, dst, src);
7147    } else if (src.regClass() == s1) {
7148       bld.sop1(aco_opcode::s_mov_b32, dst, src);
7149    } else if (src.regClass() == s2) {
7150       bld.sop1(aco_opcode::s_mov_b64, dst, src);
7151    } else {
7152       fprintf(stderr, "Unimplemented NIR instr bit size: ");
7153       nir_print_instr(&instr->instr, stderr);
7154       fprintf(stderr, "\n");
7155    }
7156 }
7157
7158 void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
7159 {
7160    Builder bld(ctx->program, ctx->block);
7161    Temp persp_center = get_arg(ctx, ctx->args->ac.persp_center);
7162    Temp p1 = emit_extract_vector(ctx, persp_center, 0, v1);
7163    Temp p2 = emit_extract_vector(ctx, persp_center, 1, v1);
7164
7165    Temp ddx_1, ddx_2, ddy_1, ddy_2;
7166    uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0);
7167    uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1);
7168    uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2);
7169
7170    /* Build DD X/Y */
7171    if (ctx->program->chip_class >= GFX8) {
7172       Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0);
7173       ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1);
7174       ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2);
7175       Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0);
7176       ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1);
7177       ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2);
7178    } else {
7179       Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0);
7180       ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1);
7181       ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1);
7182       ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2);
7183       ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_1);
7184       Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0);
7185       ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1);
7186       ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_2);
7187       ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2);
7188       ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2);
7189    }
7190
7191    /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
7192    Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
7193    Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
7194    tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
7195    tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
7196    Temp wqm1 = bld.tmp(v1);
7197    emit_wqm(ctx, tmp1, wqm1, true);
7198    Temp wqm2 = bld.tmp(v1);
7199    emit_wqm(ctx, tmp2, wqm2, true);
7200    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
7201    return;
7202 }
7203
7204 void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
7205 {
7206    Builder bld(ctx->program, ctx->block);
7207    switch(instr->intrinsic) {
7208    case nir_intrinsic_load_barycentric_sample:
7209    case nir_intrinsic_load_barycentric_pixel:
7210    case nir_intrinsic_load_barycentric_centroid: {
7211       glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
7212       Temp bary = Temp(0, s2);
7213       switch (mode) {
7214       case INTERP_MODE_SMOOTH:
7215       case INTERP_MODE_NONE:
7216          if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
7217             bary = get_arg(ctx, ctx->args->ac.persp_center);
7218          else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
7219             bary = ctx->persp_centroid;
7220          else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
7221             bary = get_arg(ctx, ctx->args->ac.persp_sample);
7222          break;
7223       case INTERP_MODE_NOPERSPECTIVE:
7224          if (instr->intrinsic == nir_intrinsic_load_barycentric_pixel)
7225             bary = get_arg(ctx, ctx->args->ac.linear_center);
7226          else if (instr->intrinsic == nir_intrinsic_load_barycentric_centroid)
7227             bary = ctx->linear_centroid;
7228          else if (instr->intrinsic == nir_intrinsic_load_barycentric_sample)
7229             bary = get_arg(ctx, ctx->args->ac.linear_sample);
7230          break;
7231       default:
7232          break;
7233       }
7234       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7235       Temp p1 = emit_extract_vector(ctx, bary, 0, v1);
7236       Temp p2 = emit_extract_vector(ctx, bary, 1, v1);
7237       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
7238                  Operand(p1), Operand(p2));
7239       emit_split_vector(ctx, dst, 2);
7240       break;
7241    }
7242    case nir_intrinsic_load_barycentric_model: {
7243       Temp model = get_arg(ctx, ctx->args->ac.pull_model);
7244
7245       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7246       Temp p1 = emit_extract_vector(ctx, model, 0, v1);
7247       Temp p2 = emit_extract_vector(ctx, model, 1, v1);
7248       Temp p3 = emit_extract_vector(ctx, model, 2, v1);
7249       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
7250                  Operand(p1), Operand(p2), Operand(p3));
7251       emit_split_vector(ctx, dst, 3);
7252       break;
7253    }
7254    case nir_intrinsic_load_barycentric_at_sample: {
7255       uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
7256       switch (ctx->options->key.fs.num_samples) {
7257          case 2: sample_pos_offset += 1 << 3; break;
7258          case 4: sample_pos_offset += 3 << 3; break;
7259          case 8: sample_pos_offset += 7 << 3; break;
7260          default: break;
7261       }
7262       Temp sample_pos;
7263       Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
7264       nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
7265       Temp private_segment_buffer = ctx->program->private_segment_buffer;
7266       if (addr.type() == RegType::sgpr) {
7267          Operand offset;
7268          if (const_addr) {
7269             sample_pos_offset += const_addr->u32 << 3;
7270             offset = Operand(sample_pos_offset);
7271          } else if (ctx->options->chip_class >= GFX9) {
7272             offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
7273          } else {
7274             offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
7275             offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
7276          }
7277
7278          Operand off = bld.copy(bld.def(s1), Operand(offset));
7279          sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off);
7280
7281       } else if (ctx->options->chip_class >= GFX9) {
7282          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
7283          sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, private_segment_buffer, sample_pos_offset);
7284       } else if (ctx->options->chip_class >= GFX7) {
7285          /* addr += private_segment_buffer + sample_pos_offset */
7286          Temp tmp0 = bld.tmp(s1);
7287          Temp tmp1 = bld.tmp(s1);
7288          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), private_segment_buffer);
7289          Definition scc_tmp = bld.def(s1, scc);
7290          tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
7291          tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), bld.scc(scc_tmp.getTemp()));
7292          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
7293          Temp pck0 = bld.tmp(v1);
7294          Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
7295          tmp1 = as_vgpr(ctx, tmp1);
7296          Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand(0u), carry);
7297          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
7298
7299          /* sample_pos = flat_load_dwordx2 addr */
7300          sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
7301       } else {
7302          assert(ctx->options->chip_class == GFX6);
7303
7304          uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
7305                               S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
7306          Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, Operand(0u), Operand(rsrc_conf));
7307
7308          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
7309          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand(0u));
7310
7311          sample_pos = bld.tmp(v2);
7312
7313          aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)};
7314          load->definitions[0] = Definition(sample_pos);
7315          load->operands[0] = Operand(rsrc);
7316          load->operands[1] = Operand(addr);
7317          load->operands[2] = Operand(0u);
7318          load->offset = sample_pos_offset;
7319          load->offen = 0;
7320          load->addr64 = true;
7321          load->glc = false;
7322          load->dlc = false;
7323          load->disable_wqm = false;
7324          load->barrier = barrier_none;
7325          load->can_reorder = true;
7326          ctx->block->instructions.emplace_back(std::move(load));
7327       }
7328
7329       /* sample_pos -= 0.5 */
7330       Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
7331       Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
7332       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
7333       pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
7334       pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
7335
7336       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
7337       break;
7338    }
7339    case nir_intrinsic_load_barycentric_at_offset: {
7340       Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
7341       RegClass rc = RegClass(offset.type(), 1);
7342       Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
7343       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
7344       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
7345       break;
7346    }
7347    case nir_intrinsic_load_front_face: {
7348       bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7349                Operand(0u), get_arg(ctx, ctx->args->ac.front_face)).def(0).setHint(vcc);
7350       break;
7351    }
   case nir_intrinsic_load_view_index: {
      /* In the software VS/GS/TCS/TES stages the view index is copied from the
       * view_index shader argument. */
      if (ctx->stage & (sw_vs | sw_gs | sw_tcs | sw_tes)) {
         Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
         bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index)));
         break;
      }

      /* Every other stage shares the load_layer_id code below. */
      /* fallthrough */
   }
   case nir_intrinsic_load_layer_id: {
      /* Read the value with v_interp_mov_f32 from the input slot given by the
       * intrinsic's base index, with the prim_mask argument supplied as m0.
       * NOTE(review): Operand(2u) presumably selects the P0 vertex parameter and
       * the trailing 0 the component -- confirm against Builder::vintrp(). */
      unsigned idx = nir_intrinsic_base(instr);
      bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
                 Operand(2u), bld.m0(get_arg(ctx, ctx->args->ac.prim_mask)), idx, 0);
      break;
   }
7367    case nir_intrinsic_load_frag_coord: {
7368       emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
7369       break;
7370    }
7371    case nir_intrinsic_load_sample_pos: {
7372       Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]);
7373       Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]);
7374       bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7375                  posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
7376                  posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
7377       break;
7378    }
7379    case nir_intrinsic_load_tess_coord:
7380       visit_load_tess_coord(ctx, instr);
7381       break;
7382    case nir_intrinsic_load_interpolated_input:
7383       visit_load_interpolated_input(ctx, instr);
7384       break;
7385    case nir_intrinsic_store_output:
7386       visit_store_output(ctx, instr);
7387       break;
7388    case nir_intrinsic_load_input:
7389    case nir_intrinsic_load_input_vertex:
7390       visit_load_input(ctx, instr);
7391       break;
7392    case nir_intrinsic_load_output:
7393       visit_load_output(ctx, instr);
7394       break;
7395    case nir_intrinsic_load_per_vertex_input:
7396       visit_load_per_vertex_input(ctx, instr);
7397       break;
7398    case nir_intrinsic_load_per_vertex_output:
7399       visit_load_per_vertex_output(ctx, instr);
7400       break;
7401    case nir_intrinsic_store_per_vertex_output:
7402       visit_store_per_vertex_output(ctx, instr);
7403       break;
7404    case nir_intrinsic_load_ubo:
7405       visit_load_ubo(ctx, instr);
7406       break;
7407    case nir_intrinsic_load_push_constant:
7408       visit_load_push_constant(ctx, instr);
7409       break;
7410    case nir_intrinsic_load_constant:
7411       visit_load_constant(ctx, instr);
7412       break;
7413    case nir_intrinsic_vulkan_resource_index:
7414       visit_load_resource(ctx, instr);
7415       break;
7416    case nir_intrinsic_discard:
7417       visit_discard(ctx, instr);
7418       break;
7419    case nir_intrinsic_discard_if:
7420       visit_discard_if(ctx, instr);
7421       break;
7422    case nir_intrinsic_load_shared:
7423       visit_load_shared(ctx, instr);
7424       break;
7425    case nir_intrinsic_store_shared:
7426       visit_store_shared(ctx, instr);
7427       break;
7428    case nir_intrinsic_shared_atomic_add:
7429    case nir_intrinsic_shared_atomic_imin:
7430    case nir_intrinsic_shared_atomic_umin:
7431    case nir_intrinsic_shared_atomic_imax:
7432    case nir_intrinsic_shared_atomic_umax:
7433    case nir_intrinsic_shared_atomic_and:
7434    case nir_intrinsic_shared_atomic_or:
7435    case nir_intrinsic_shared_atomic_xor:
7436    case nir_intrinsic_shared_atomic_exchange:
7437    case nir_intrinsic_shared_atomic_comp_swap:
7438       visit_shared_atomic(ctx, instr);
7439       break;
7440    case nir_intrinsic_image_deref_load:
7441       visit_image_load(ctx, instr);
7442       break;
7443    case nir_intrinsic_image_deref_store:
7444       visit_image_store(ctx, instr);
7445       break;
7446    case nir_intrinsic_image_deref_atomic_add:
7447    case nir_intrinsic_image_deref_atomic_umin:
7448    case nir_intrinsic_image_deref_atomic_imin:
7449    case nir_intrinsic_image_deref_atomic_umax:
7450    case nir_intrinsic_image_deref_atomic_imax:
7451    case nir_intrinsic_image_deref_atomic_and:
7452    case nir_intrinsic_image_deref_atomic_or:
7453    case nir_intrinsic_image_deref_atomic_xor:
7454    case nir_intrinsic_image_deref_atomic_exchange:
7455    case nir_intrinsic_image_deref_atomic_comp_swap:
7456       visit_image_atomic(ctx, instr);
7457       break;
7458    case nir_intrinsic_image_deref_size:
7459       visit_image_size(ctx, instr);
7460       break;
7461    case nir_intrinsic_load_ssbo:
7462       visit_load_ssbo(ctx, instr);
7463       break;
7464    case nir_intrinsic_store_ssbo:
7465       visit_store_ssbo(ctx, instr);
7466       break;
7467    case nir_intrinsic_load_global:
7468       visit_load_global(ctx, instr);
7469       break;
7470    case nir_intrinsic_store_global:
7471       visit_store_global(ctx, instr);
7472       break;
7473    case nir_intrinsic_global_atomic_add:
7474    case nir_intrinsic_global_atomic_imin:
7475    case nir_intrinsic_global_atomic_umin:
7476    case nir_intrinsic_global_atomic_imax:
7477    case nir_intrinsic_global_atomic_umax:
7478    case nir_intrinsic_global_atomic_and:
7479    case nir_intrinsic_global_atomic_or:
7480    case nir_intrinsic_global_atomic_xor:
7481    case nir_intrinsic_global_atomic_exchange:
7482    case nir_intrinsic_global_atomic_comp_swap:
7483       visit_global_atomic(ctx, instr);
7484       break;
7485    case nir_intrinsic_ssbo_atomic_add:
7486    case nir_intrinsic_ssbo_atomic_imin:
7487    case nir_intrinsic_ssbo_atomic_umin:
7488    case nir_intrinsic_ssbo_atomic_imax:
7489    case nir_intrinsic_ssbo_atomic_umax:
7490    case nir_intrinsic_ssbo_atomic_and:
7491    case nir_intrinsic_ssbo_atomic_or:
7492    case nir_intrinsic_ssbo_atomic_xor:
7493    case nir_intrinsic_ssbo_atomic_exchange:
7494    case nir_intrinsic_ssbo_atomic_comp_swap:
7495       visit_atomic_ssbo(ctx, instr);
7496       break;
7497    case nir_intrinsic_load_scratch:
7498       visit_load_scratch(ctx, instr);
7499       break;
7500    case nir_intrinsic_store_scratch:
7501       visit_store_scratch(ctx, instr);
7502       break;
7503    case nir_intrinsic_get_buffer_size:
7504       visit_get_buffer_size(ctx, instr);
7505       break;
7506    case nir_intrinsic_control_barrier: {
7507       if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
7508          /* GFX6 only (thanks to a hw bug workaround):
7509           * The real barrier instruction isn’t needed, because an entire patch
7510           * always fits into a single wave.
7511           */
7512          break;
7513       }
7514
7515       if (ctx->program->workgroup_size > ctx->program->wave_size)
7516          bld.sopp(aco_opcode::s_barrier);
7517
7518       break;
7519    }
7520    case nir_intrinsic_memory_barrier_tcs_patch:
7521    case nir_intrinsic_group_memory_barrier:
7522    case nir_intrinsic_memory_barrier:
7523    case nir_intrinsic_memory_barrier_buffer:
7524    case nir_intrinsic_memory_barrier_image:
7525    case nir_intrinsic_memory_barrier_shared:
7526       emit_memory_barrier(ctx, instr);
7527       break;
7528    case nir_intrinsic_load_num_work_groups: {
7529       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7530       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups)));
7531       emit_split_vector(ctx, dst, 3);
7532       break;
7533    }
7534    case nir_intrinsic_load_local_invocation_id: {
7535       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7536       bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids)));
7537       emit_split_vector(ctx, dst, 3);
7538       break;
7539    }
7540    case nir_intrinsic_load_work_group_id: {
7541       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7542       struct ac_arg *args = ctx->args->ac.workgroup_ids;
7543       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
7544                  args[0].used ? Operand(get_arg(ctx, args[0])) : Operand(0u),
7545                  args[1].used ? Operand(get_arg(ctx, args[1])) : Operand(0u),
7546                  args[2].used ? Operand(get_arg(ctx, args[2])) : Operand(0u));
7547       emit_split_vector(ctx, dst, 3);
7548       break;
7549    }
7550    case nir_intrinsic_load_local_invocation_index: {
7551       Temp id = emit_mbcnt(ctx, bld.def(v1));
7552
7553       /* The tg_size bits [6:11] contain the subgroup id,
7554        * we need this multiplied by the wave size, and then OR the thread id to it.
7555        */
7556       if (ctx->program->wave_size == 64) {
7557          /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just feed that to v_or */
7558          Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u),
7559                                 get_arg(ctx, ctx->args->ac.tg_size));
7560          bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
7561       } else {
7562          /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR  */
7563          Temp tg_num = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
7564                                 get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
7565          bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, Operand(0x5u), id);
7566       }
7567       break;
7568    }
7569    case nir_intrinsic_load_subgroup_id: {
7570       if (ctx->stage == compute_cs) {
7571          bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc),
7572                   get_arg(ctx, ctx->args->ac.tg_size), Operand(0x6u | (0x6u << 16)));
7573       } else {
7574          bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
7575       }
7576       break;
7577    }
7578    case nir_intrinsic_load_subgroup_invocation: {
7579       emit_mbcnt(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)));
7580       break;
7581    }
7582    case nir_intrinsic_load_num_subgroups: {
7583       if (ctx->stage == compute_cs)
7584          bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu),
7585                   get_arg(ctx, ctx->args->ac.tg_size));
7586       else
7587          bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
7588       break;
7589    }
7590    case nir_intrinsic_ballot: {
7591       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7592       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7593       Definition tmp = bld.def(dst.regClass());
7594       Definition lanemask_tmp = dst.size() == bld.lm.size() ? tmp : bld.def(src.regClass());
7595       if (instr->src[0].ssa->bit_size == 1) {
7596          assert(src.regClass() == bld.lm);
7597          bld.sop2(Builder::s_and, lanemask_tmp, bld.def(s1, scc), Operand(exec, bld.lm), src);
7598       } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
7599          bld.vopc(aco_opcode::v_cmp_lg_u32, lanemask_tmp, Operand(0u), src);
7600       } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
7601          bld.vopc(aco_opcode::v_cmp_lg_u64, lanemask_tmp, Operand(0u), src);
7602       } else {
7603          fprintf(stderr, "Unimplemented NIR instr bit size: ");
7604          nir_print_instr(&instr->instr, stderr);
7605          fprintf(stderr, "\n");
7606       }
7607       if (dst.size() != bld.lm.size()) {
7608          /* Wave32 with ballot size set to 64 */
7609          bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lanemask_tmp.getTemp(), Operand(0u));
7610       }
7611       emit_wqm(ctx, tmp.getTemp(), dst);
7612       break;
7613    }
   case nir_intrinsic_shuffle:
   case nir_intrinsic_read_invocation: {
      /* Read the source value of the invocation selected by src[1].
       * A non-divergent source is handled by emit_uniform_subgroup(); otherwise
       * the code below picks a strategy based on the source register class.
       */
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      if (!nir_src_is_divergent(instr->src[0])) {
         emit_uniform_subgroup(ctx, instr, src);
      } else {
         Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
         /* read_invocation, or a shuffle with a non-divergent index: make the
          * lane index uniform. */
         if (instr->intrinsic == nir_intrinsic_read_invocation || !nir_src_is_divergent(instr->src[1]))
            tid = bld.as_uniform(tid);
         Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
         if (src.regClass() == v1b || src.regClass() == v2b) {
            /* 8/16-bit VGPR: permute into a full dword, then keep only the low
             * bytes (VGPR destination) or convert to a uniform value. */
            Temp tmp = bld.tmp(v1);
            tmp = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), tmp);
            if (dst.type() == RegType::vgpr)
               bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
            else
               bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
         } else if (src.regClass() == v1) {
            emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst);
         } else if (src.regClass() == v2) {
            /* 64-bit: permute both dwords separately and recombine them. */
            Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
            bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
            lo = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, lo));
            hi = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, hi));
            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
            emit_split_vector(ctx, dst, 2);
         } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) {
            /* Boolean with a uniform index: test bit 'tid' of the lane mask. */
            assert(src.regClass() == bld.lm);
            Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid);
            bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
         } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) {
            /* Boolean with a per-lane index: shift the lane mask right by 'tid'
             * in every lane, isolate bit 0 and compare it against zero. The
             * shift opcode depends on the chip class and the wave size. */
            assert(src.regClass() == bld.lm);
            Temp tmp;
            if (ctx->program->chip_class <= GFX7)
               tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid);
            else if (ctx->program->wave_size == 64)
               tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
            else
               tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src);
            tmp = emit_extract_vector(ctx, tmp, 0, v1);
            tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
            emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst);
         } else {
            fprintf(stderr, "Unimplemented NIR instr bit size: ");
            nir_print_instr(&instr->instr, stderr);
            fprintf(stderr, "\n");
         }
      }
      break;
   }
7664    case nir_intrinsic_load_sample_id: {
7665       bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
7666                get_arg(ctx, ctx->args->ac.ancillary), Operand(8u), Operand(4u));
7667       break;
7668    }
   case nir_intrinsic_load_sample_mask_in: {
      /* Handled entirely by visit_load_sample_mask_in(). */
      visit_load_sample_mask_in(ctx, instr);
      break;
   }
   case nir_intrinsic_read_first_invocation: {
      /* Broadcast the source value of the first active invocation. The
       * strategy depends on the register class of the source. */
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
         /* VGPR of at most one dword: a single v_readfirstlane_b32. */
         emit_wqm(ctx,
                  bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
                  dst);
      } else if (src.regClass() == v2) {
         /* 64-bit VGPR: read both dwords separately and recombine them. */
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
         hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
         emit_split_vector(ctx, dst, 2);
      } else if (instr->dest.ssa.bit_size == 1) {
         /* Boolean: s_ff1_i32 on exec yields the index of the first active
          * lane; s_bitcmp1 then tests that bit of the source lane mask. */
         assert(src.regClass() == bld.lm);
         Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src,
                             bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)));
         bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
      } else if (src.regClass() == s1) {
         /* SGPR sources are simply copied to the destination. */
         bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
      } else if (src.regClass() == s2) {
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
7703    case nir_intrinsic_vote_all: {
7704       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7705       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7706       assert(src.regClass() == bld.lm);
7707       assert(dst.regClass() == bld.lm);
7708
7709       Temp tmp = bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src).def(1).getTemp();
7710       Temp cond = bool_to_vector_condition(ctx, emit_wqm(ctx, tmp));
7711       bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond);
7712       break;
7713    }
7714    case nir_intrinsic_vote_any: {
7715       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7716       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7717       assert(src.regClass() == bld.lm);
7718       assert(dst.regClass() == bld.lm);
7719
7720       Temp tmp = bool_to_scalar_condition(ctx, src);
7721       bool_to_vector_condition(ctx, emit_wqm(ctx, tmp), dst);
7722       break;
7723    }
   case nir_intrinsic_reduce:
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan: {
      /* Subgroup reduction and inclusive/exclusive scan.
       * Handled in this order: uniform sources with ior/iand, booleans,
       * clusters of a single invocation, and finally the generic path that
       * emits a Pseudo_reduction_instruction.
       */
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
      /* Only reduce carries a cluster size; scans start from 0. A value of 0
       * selects the whole wave. The result is clamped to the wave size and
       * rounded up to a power of two. */
      unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
         nir_intrinsic_cluster_size(instr) : 0;
      cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));

      if (!nir_src_is_divergent(instr->src[0]) && (op == nir_op_ior || op == nir_op_iand)) {
         emit_uniform_subgroup(ctx, instr, src);
      } else if (instr->dest.ssa.bit_size == 1) {
         /* Map the operation onto its boolean equivalent (and / xor / or). */
         if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
            op = nir_op_iand;
         else if (op == nir_op_iadd)
            op = nir_op_ixor;
         else if (op == nir_op_umax || op == nir_op_imax)
            op = nir_op_ior;
         assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);

         switch (instr->intrinsic) {
         case nir_intrinsic_reduce:
            emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
            break;
         case nir_intrinsic_exclusive_scan:
            emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
            break;
         case nir_intrinsic_inclusive_scan:
            emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
            break;
         default:
            assert(false);
         }
      } else if (cluster_size == 1) {
         /* A cluster of one invocation: the result is the source itself. */
         bld.copy(Definition(dst), src);
      } else {
         unsigned bit_size = instr->src[0].ssa->bit_size;

         /* Obtain the source as a VGPR of bit_size / 8 bytes. */
         src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));

         /* Select the ReduceOp variant matching the operation and bit size:
          * integer ops have 8/16/32/64-bit variants, float ops 16/32/64-bit. */
         ReduceOp reduce_op;
         switch (op) {
         #define CASEI(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; break;
         #define CASEF(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; break;
            CASEI(iadd)
            CASEI(imul)
            CASEI(imin)
            CASEI(umin)
            CASEI(imax)
            CASEI(umax)
            CASEI(iand)
            CASEI(ior)
            CASEI(ixor)
            CASEF(fadd)
            CASEF(fmul)
            CASEF(fmin)
            CASEF(fmax)
            default:
               unreachable("unknown reduction op");
         #undef CASEI
         #undef CASEF
         }

         aco_opcode aco_op;
         switch (instr->intrinsic) {
            case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
            case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
            case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
            default:
               unreachable("unknown reduce intrinsic");
         }

         /* The pseudo instruction has 3 operands and 5 definitions; besides the
          * source and the result, the remaining slots are placeholders. */
         aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
         reduce->operands[0] = Operand(src);
         // filled in by aco_reduce_assign.cpp, used internally as part of the
         // reduce sequence
         assert(dst.size() == 1 || dst.size() == 2);
         reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
         reduce->operands[2] = Operand(v1.as_linear());

         Temp tmp_dst = bld.tmp(dst.regClass());
         reduce->definitions[0] = Definition(tmp_dst);
         reduce->definitions[1] = bld.def(ctx->program->lane_mask); // used internally
         reduce->definitions[2] = Definition();
         reduce->definitions[3] = Definition(scc, s1);
         reduce->definitions[4] = Definition();
         reduce->reduce_op = reduce_op;
         reduce->cluster_size = cluster_size;
         ctx->block->instructions.emplace_back(std::move(reduce));

         emit_wqm(ctx, tmp_dst, dst);
      }
      break;
   }
7819    case nir_intrinsic_quad_broadcast: {
7820       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7821       if (!nir_dest_is_divergent(instr->dest)) {
7822          emit_uniform_subgroup(ctx, instr, src);
7823       } else {
7824          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7825          unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
7826          uint32_t dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane);
7827
7828          if (instr->dest.ssa.bit_size == 1) {
7829             assert(src.regClass() == bld.lm);
7830             assert(dst.regClass() == bld.lm);
7831             uint32_t half_mask = 0x11111111u << lane;
7832             Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
7833             Temp tmp = bld.tmp(bld.lm);
7834             bld.sop1(Builder::s_wqm, Definition(tmp),
7835                      bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp,
7836                               bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))));
7837             emit_wqm(ctx, tmp, dst);
7838          } else if (instr->dest.ssa.bit_size == 8) {
7839             Temp tmp = bld.tmp(v1);
7840             if (ctx->program->chip_class >= GFX8)
7841                emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
7842             else
7843                emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp);
7844             bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
7845          } else if (instr->dest.ssa.bit_size == 16) {
7846             Temp tmp = bld.tmp(v1);
7847             if (ctx->program->chip_class >= GFX8)
7848                emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
7849             else
7850                emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp);
7851             bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
7852          } else if (instr->dest.ssa.bit_size == 32) {
7853             if (ctx->program->chip_class >= GFX8)
7854                emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst);
7855             else
7856                emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), dst);
7857          } else if (instr->dest.ssa.bit_size == 64) {
7858             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7859             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7860             if (ctx->program->chip_class >= GFX8) {
7861                lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
7862                hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
7863             } else {
7864                lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl));
7865                hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl));
7866             }
7867             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7868             emit_split_vector(ctx, dst, 2);
7869          } else {
7870             fprintf(stderr, "Unimplemented NIR instr bit size: ");
7871             nir_print_instr(&instr->instr, stderr);
7872             fprintf(stderr, "\n");
7873          }
7874       }
7875       break;
7876    }
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
   case nir_intrinsic_quad_swizzle_amd: {
      /* Permute values within each quad. The three swap intrinsics use a fixed
       * quad permutation; quad_swizzle_amd takes it from the intrinsic's
       * swizzle mask. A non-divergent result is handled by
       * emit_uniform_subgroup().
       */
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      if (!nir_dest_is_divergent(instr->dest)) {
         emit_uniform_subgroup(ctx, instr, src);
         break;
      }
      uint16_t dpp_ctrl = 0;
      switch (instr->intrinsic) {
      case nir_intrinsic_quad_swap_horizontal:
         dpp_ctrl = dpp_quad_perm(1, 0, 3, 2);
         break;
      case nir_intrinsic_quad_swap_vertical:
         dpp_ctrl = dpp_quad_perm(2, 3, 0, 1);
         break;
      case nir_intrinsic_quad_swap_diagonal:
         dpp_ctrl = dpp_quad_perm(3, 2, 1, 0);
         break;
      case nir_intrinsic_quad_swizzle_amd:
         dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
         break;
      default:
         break;
      }
      /* Before GFX8 the permutation is done with ds_swizzle_b32, whose control
       * word additionally gets bit 15 set. */
      if (ctx->program->chip_class < GFX8)
         dpp_ctrl |= (1 << 15);

      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      if (instr->dest.ssa.bit_size == 1) {
         /* Boolean: expand the lane mask to 0 / -1 per lane, permute that
          * value and compare it against zero to get a lane mask again. */
         assert(src.regClass() == bld.lm);
         src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
         if (ctx->program->chip_class >= GFX8)
            src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
         else
            src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
         Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
         emit_wqm(ctx, tmp, dst);
      } else if (instr->dest.ssa.bit_size == 8) {
         /* 8-bit: permute a full dword, then keep the low byte. */
         Temp tmp = bld.tmp(v1);
         if (ctx->program->chip_class >= GFX8)
            emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
         else
            emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
      } else if (instr->dest.ssa.bit_size == 16) {
         /* 16-bit: permute a full dword, then keep the low two bytes. */
         Temp tmp = bld.tmp(v1);
         if (ctx->program->chip_class >= GFX8)
            emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
         else
            emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
      } else if (instr->dest.ssa.bit_size == 32) {
         Temp tmp;
         if (ctx->program->chip_class >= GFX8)
            tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
         else
            tmp = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
         emit_wqm(ctx, tmp, dst);
      } else if (instr->dest.ssa.bit_size == 64) {
         /* 64-bit: permute both dwords separately and recombine them. */
         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
         if (ctx->program->chip_class >= GFX8) {
            lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
            hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
         } else {
            lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, dpp_ctrl));
            hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, dpp_ctrl));
         }
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
         emit_split_vector(ctx, dst, 2);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
7956    case nir_intrinsic_masked_swizzle_amd: {
7957       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
7958       if (!nir_dest_is_divergent(instr->dest)) {
7959          emit_uniform_subgroup(ctx, instr, src);
7960          break;
7961       }
7962       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7963       uint32_t mask = nir_intrinsic_swizzle_mask(instr);
7964       if (instr->dest.ssa.bit_size == 1) {
7965          assert(src.regClass() == bld.lm);
7966          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
7967          src = emit_masked_swizzle(ctx, bld, src, mask);
7968          Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
7969          emit_wqm(ctx, tmp, dst);
7970       } else if (dst.regClass() == v1b) {
7971          Temp tmp = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask));
7972          emit_extract_vector(ctx, tmp, 0, dst);
7973       } else if (dst.regClass() == v2b) {
7974          Temp tmp = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask));
7975          emit_extract_vector(ctx, tmp, 0, dst);
7976       } else if (dst.regClass() == v1) {
7977          emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask), dst);
7978       } else if (dst.regClass() == v2) {
7979          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
7980          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
7981          lo = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, lo, mask));
7982          hi = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, hi, mask));
7983          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
7984          emit_split_vector(ctx, dst, 2);
7985       } else {
7986          fprintf(stderr, "Unimplemented NIR instr bit size: ");
7987          nir_print_instr(&instr->instr, stderr);
7988          fprintf(stderr, "\n");
7989       }
7990       break;
7991    }
7992    case nir_intrinsic_write_invocation_amd: {
7993       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
7994       Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
7995       Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
7996       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
7997       if (dst.regClass() == v1) {
7998          /* src2 is ignored for writelane. RA assigns the same reg for dst */
7999          emit_wqm(ctx, bld.writelane(bld.def(v1), val, lane, src), dst);
8000       } else if (dst.regClass() == v2) {
8001          Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
8002          Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
8003          bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
8004          bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
8005          Temp lo = emit_wqm(ctx, bld.writelane(bld.def(v1), val_lo, lane, src_hi));
8006          Temp hi = emit_wqm(ctx, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
8007          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
8008          emit_split_vector(ctx, dst, 2);
8009       } else {
8010          fprintf(stderr, "Unimplemented NIR instr bit size: ");
8011          nir_print_instr(&instr->instr, stderr);
8012          fprintf(stderr, "\n");
8013       }
8014       break;
8015    }
8016    case nir_intrinsic_mbcnt_amd: {
8017       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8018       RegClass rc = RegClass(src.type(), 1);
8019       Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
8020       bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
8021       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8022       Temp wqm_tmp = emit_mbcnt(ctx, bld.def(v1), Operand(mask_lo), Operand(mask_hi));
8023       emit_wqm(ctx, wqm_tmp, dst);
8024       break;
8025    }
8026    case nir_intrinsic_load_helper_invocation: {
8027       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8028       bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
8029       ctx->block->kind |= block_kind_needs_lowering;
8030       ctx->program->needs_exact = true;
8031       break;
8032    }
8033    case nir_intrinsic_is_helper_invocation: {
8034       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8035       bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
8036       ctx->block->kind |= block_kind_needs_lowering;
8037       ctx->program->needs_exact = true;
8038       break;
8039    }
8040    case nir_intrinsic_demote:
8041       bld.pseudo(aco_opcode::p_demote_to_helper, Operand(-1u));
8042
8043       if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8044          ctx->cf_info.exec_potentially_empty_discard = true;
8045       ctx->block->kind |= block_kind_uses_demote;
8046       ctx->program->needs_exact = true;
8047       break;
8048    case nir_intrinsic_demote_if: {
8049       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
8050       assert(src.regClass() == bld.lm);
8051       Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
8052       bld.pseudo(aco_opcode::p_demote_to_helper, cond);
8053
8054       if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
8055          ctx->cf_info.exec_potentially_empty_discard = true;
8056       ctx->block->kind |= block_kind_uses_demote;
8057       ctx->program->needs_exact = true;
8058       break;
8059    }
8060    case nir_intrinsic_first_invocation: {
8061       emit_wqm(ctx, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)),
8062                get_ssa_temp(ctx, &instr->dest.ssa));
8063       break;
8064    }
8065    case nir_intrinsic_shader_clock: {
8066       aco_opcode opcode =
8067          nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ?
8068             aco_opcode::s_memrealtime : aco_opcode::s_memtime;
8069       bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
8070       emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
8071       break;
8072    }
8073    case nir_intrinsic_load_vertex_id_zero_base: {
8074       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8075       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
8076       break;
8077    }
8078    case nir_intrinsic_load_first_vertex: {
8079       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8080       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex));
8081       break;
8082    }
8083    case nir_intrinsic_load_base_instance: {
8084       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8085       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance));
8086       break;
8087    }
8088    case nir_intrinsic_load_instance_id: {
8089       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8090       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id));
8091       break;
8092    }
8093    case nir_intrinsic_load_draw_id: {
8094       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8095       bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id));
8096       break;
8097    }
8098    case nir_intrinsic_load_invocation_id: {
8099       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8100
8101       if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
8102          if (ctx->options->chip_class >= GFX10)
8103             bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand(127u), get_arg(ctx, ctx->args->ac.gs_invocation_id));
8104          else
8105             bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id));
8106       } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
8107          bld.vop3(aco_opcode::v_bfe_u32, Definition(dst),
8108                   get_arg(ctx, ctx->args->ac.tcs_rel_ids), Operand(8u), Operand(5u));
8109       } else {
8110          unreachable("Unsupported stage for load_invocation_id");
8111       }
8112
8113       break;
8114    }
8115    case nir_intrinsic_load_primitive_id: {
8116       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8117
8118       switch (ctx->shader->info.stage) {
8119       case MESA_SHADER_GEOMETRY:
8120          bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id));
8121          break;
8122       case MESA_SHADER_TESS_CTRL:
8123          bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id));
8124          break;
8125       case MESA_SHADER_TESS_EVAL:
8126          bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id));
8127          break;
8128       default:
8129          unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id");
8130       }
8131
8132       break;
8133    }
8134    case nir_intrinsic_load_patch_vertices_in: {
8135       assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL ||
8136              ctx->shader->info.stage == MESA_SHADER_TESS_EVAL);
8137
8138       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8139       bld.copy(Definition(dst), Operand(ctx->args->options->key.tcs.input_vertices));
8140       break;
8141    }
8142    case nir_intrinsic_emit_vertex_with_counter: {
8143       visit_emit_vertex_with_counter(ctx, instr);
8144       break;
8145    }
8146    case nir_intrinsic_end_primitive_with_counter: {
8147       unsigned stream = nir_intrinsic_stream_id(instr);
8148       bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(true, false, stream));
8149       break;
8150    }
8151    case nir_intrinsic_set_vertex_count: {
8152       /* unused, the HW keeps track of this for us */
8153       break;
8154    }
8155    default:
8156       fprintf(stderr, "Unimplemented intrinsic instr: ");
8157       nir_print_instr(&instr->instr, stderr);
8158       fprintf(stderr, "\n");
8159       abort();
8160
8161       break;
8162    }
8163 }
8164
8165
8166 void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr,
8167                     Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr,
8168                     enum glsl_base_type *stype)
8169 {
8170    nir_deref_instr *texture_deref_instr = NULL;
8171    nir_deref_instr *sampler_deref_instr = NULL;
8172    int plane = -1;
8173
8174    for (unsigned i = 0; i < instr->num_srcs; i++) {
8175       switch (instr->src[i].src_type) {
8176       case nir_tex_src_texture_deref:
8177          texture_deref_instr = nir_src_as_deref(instr->src[i].src);
8178          break;
8179       case nir_tex_src_sampler_deref:
8180          sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
8181          break;
8182       case nir_tex_src_plane:
8183          plane = nir_src_as_int(instr->src[i].src);
8184          break;
8185       default:
8186          break;
8187       }
8188    }
8189
8190    *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
8191
8192    if (!sampler_deref_instr)
8193       sampler_deref_instr = texture_deref_instr;
8194
8195    if (plane >= 0) {
8196       assert(instr->op != nir_texop_txf_ms &&
8197              instr->op != nir_texop_samples_identical);
8198       assert(instr->sampler_dim  != GLSL_SAMPLER_DIM_BUF);
8199       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
8200    } else if (instr->sampler_dim  == GLSL_SAMPLER_DIM_BUF) {
8201       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
8202    } else if (instr->op == nir_texop_fragment_mask_fetch) {
8203       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
8204    } else {
8205       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
8206    }
8207    if (samp_ptr) {
8208       *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false);
8209
8210       if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
8211          /* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */
8212          Builder bld(ctx->program, ctx->block);
8213
8214          /* to avoid unnecessary moves, we split and recombine sampler and image */
8215          Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1),
8216                         bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
8217          Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)};
8218          bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]),
8219                     Definition(img[2]), Definition(img[3]), Definition(img[4]),
8220                     Definition(img[5]), Definition(img[6]), Definition(img[7]), *res_ptr);
8221          bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]),
8222                     Definition(samp[2]), Definition(samp[3]), *samp_ptr);
8223
8224          samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]);
8225          *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
8226                                img[0], img[1], img[2], img[3],
8227                                img[4], img[5], img[6], img[7]);
8228          *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
8229                                 samp[0], samp[1], samp[2], samp[3]);
8230       }
8231    }
8232    if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
8233                      instr->op == nir_texop_samples_identical))
8234       *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
8235 }
8236
8237 void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
8238                        Temp *out_ma, Temp *out_sc, Temp *out_tc)
8239 {
8240    Builder bld(ctx->program, ctx->block);
8241
8242    Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
8243    Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
8244    Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
8245
8246    Operand neg_one(0xbf800000u);
8247    Operand one(0x3f800000u);
8248    Operand two(0x40000000u);
8249    Operand four(0x40800000u);
8250
8251    Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), ma);
8252    Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
8253    Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
8254
8255    Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
8256    Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
8257    is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), is_ma_y, is_ma_z);
8258    Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc), is_ma_z, is_ma_y);
8259
8260    // select sc
8261    Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
8262    Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1),
8263                        bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z),
8264                        one, is_ma_y);
8265    *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
8266
8267    // select tc
8268    tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
8269    sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
8270    *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
8271
8272    // select ma
8273    tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
8274                   bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
8275                   deriv_z, is_ma_z);
8276    tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp);
8277    *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
8278 }
8279
8280 void prepare_cube_coords(isel_context *ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
8281 {
8282    Builder bld(ctx->program, ctx->block);
8283    Temp ma, tc, sc, id;
8284
8285    if (is_array) {
8286       coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);
8287
8288       // see comment in ac_prepare_cube_coords()
8289       if (ctx->options->chip_class <= GFX8)
8290          coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coords[3]);
8291    }
8292
8293    ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8294
8295    aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
8296    vop3a->operands[0] = Operand(ma);
8297    vop3a->abs[0] = true;
8298    Temp invma = bld.tmp(v1);
8299    vop3a->definitions[0] = Definition(invma);
8300    ctx->block->instructions.emplace_back(std::move(vop3a));
8301
8302    sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8303    if (!is_deriv)
8304       sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
8305
8306    tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8307    if (!is_deriv)
8308       tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
8309
8310    id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);
8311
8312    if (is_deriv) {
8313       sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
8314       tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
8315
8316       for (unsigned i = 0; i < 2; i++) {
8317          // see comment in ac_prepare_cube_coords()
8318          Temp deriv_ma;
8319          Temp deriv_sc, deriv_tc;
8320          build_cube_select(ctx, ma, id, i ? *ddy : *ddx,
8321                            &deriv_ma, &deriv_sc, &deriv_tc);
8322
8323          deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
8324
8325          Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
8326                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
8327                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
8328          Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
8329                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
8330                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
8331          *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
8332       }
8333
8334       sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc);
8335       tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc);
8336    }
8337
8338    if (is_array)
8339       id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/));
8340    coords.resize(3);
8341    coords[0] = sc;
8342    coords[1] = tc;
8343    coords[2] = id;
8344 }
8345
8346 void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
8347 {
8348    if (vec->parent_instr->type != nir_instr_type_alu)
8349       return;
8350    nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr);
8351    if (vec_instr->op != nir_op_vec(vec->num_components))
8352       return;
8353
8354    for (unsigned i = 0; i < vec->num_components; i++) {
8355       cv[i] = vec_instr->src[i].swizzle[0] == 0 ?
8356               nir_src_as_const_value(vec_instr->src[i].src) : NULL;
8357    }
8358 }
8359
8360 void visit_tex(isel_context *ctx, nir_tex_instr *instr)
8361 {
8362    Builder bld(ctx->program, ctx->block);
8363    bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
8364         has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false,
8365         has_clamped_lod = false;
8366    Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(),
8367         lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(),
8368         clamped_lod = Temp();
8369    std::vector<Temp> coords;
8370    std::vector<Temp> derivs;
8371    nir_const_value *sample_index_cv = NULL;
8372    nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
8373    enum glsl_base_type stype;
8374    tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
8375
8376    bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
8377                                   (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
8378    bool tg4_integer_cube_workaround = tg4_integer_workarounds &&
8379                                       instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
8380
8381    for (unsigned i = 0; i < instr->num_srcs; i++) {
8382       switch (instr->src[i].src_type) {
8383       case nir_tex_src_coord: {
8384          Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa);
8385          for (unsigned i = 0; i < coord.size(); i++)
8386             coords.emplace_back(emit_extract_vector(ctx, coord, i, v1));
8387          break;
8388       }
8389       case nir_tex_src_bias:
8390          bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
8391          has_bias = true;
8392          break;
8393       case nir_tex_src_lod: {
8394          nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
8395
8396          if (val && val->f32 <= 0.0) {
8397             level_zero = true;
8398          } else {
8399             lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
8400             has_lod = true;
8401          }
8402          break;
8403       }
8404       case nir_tex_src_min_lod:
8405          clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
8406          has_clamped_lod = true;
8407          break;
8408       case nir_tex_src_comparator:
8409          if (instr->is_shadow) {
8410             compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
8411             has_compare = true;
8412          }
8413          break;
8414       case nir_tex_src_offset:
8415          offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
8416          get_const_vec(instr->src[i].src.ssa, const_offset);
8417          has_offset = true;
8418          break;
8419       case nir_tex_src_ddx:
8420          ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
8421          has_ddx = true;
8422          break;
8423       case nir_tex_src_ddy:
8424          ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
8425          has_ddy = true;
8426          break;
8427       case nir_tex_src_ms_index:
8428          sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
8429          sample_index_cv = nir_src_as_const_value(instr->src[i].src);
8430          has_sample_index = true;
8431          break;
8432       case nir_tex_src_texture_offset:
8433       case nir_tex_src_sampler_offset:
8434       default:
8435          break;
8436       }
8437    }
8438
8439    if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
8440       return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
8441
8442    if (instr->op == nir_texop_texture_samples) {
8443       Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
8444
8445       Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
8446       Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
8447       Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
8448
8449       Operand default_sample = Operand(1u);
8450       if (ctx->options->robust_buffer_access) {
8451          /* Extract the second dword of the descriptor, if it's
8452           * all zero, then it's a null descriptor.
8453           */
8454          Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
8455          Temp is_non_null_descriptor = bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u));
8456          default_sample = Operand(is_non_null_descriptor);
8457       }
8458
8459       Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
8460       bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
8461                samples, default_sample, bld.scc(is_msaa));
8462       return;
8463    }
8464
8465    if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
8466       aco_ptr<Instruction> tmp_instr;
8467       Temp acc, pack = Temp();
8468
8469       uint32_t pack_const = 0;
8470       for (unsigned i = 0; i < offset.size(); i++) {
8471          if (!const_offset[i])
8472             continue;
8473          pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
8474       }
8475
8476       if (offset.type() == RegType::sgpr) {
8477          for (unsigned i = 0; i < offset.size(); i++) {
8478             if (const_offset[i])
8479                continue;
8480
8481             acc = emit_extract_vector(ctx, offset, i, s1);
8482             acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
8483
8484             if (i) {
8485                acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
8486             }
8487
8488             if (pack == Temp()) {
8489                pack = acc;
8490             } else {
8491                pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
8492             }
8493          }
8494
8495          if (pack_const && pack != Temp())
8496             pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
8497       } else {
8498          for (unsigned i = 0; i < offset.size(); i++) {
8499             if (const_offset[i])
8500                continue;
8501
8502             acc = emit_extract_vector(ctx, offset, i, v1);
8503             acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
8504
8505             if (i) {
8506                acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
8507             }
8508
8509             if (pack == Temp()) {
8510                pack = acc;
8511             } else {
8512                pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
8513             }
8514          }
8515
8516          if (pack_const && pack != Temp())
8517             pack = bld.sop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
8518       }
8519       if (pack_const && pack == Temp())
8520          offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
8521       else if (pack == Temp())
8522          has_offset = false;
8523       else
8524          offset = pack;
8525    }
8526
8527    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
8528       prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
8529
8530    /* pack derivatives */
8531    if (has_ddx || has_ddy) {
8532       if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
8533          assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1);
8534          Temp zero = bld.copy(bld.def(v1), Operand(0u));
8535          derivs = {ddx, zero, ddy, zero};
8536       } else {
8537          for (unsigned i = 0; has_ddx && i < ddx.size(); i++)
8538             derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1));
8539          for (unsigned i = 0; has_ddy && i < ddy.size(); i++)
8540             derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1));
8541       }
8542       has_derivs = true;
8543    }
8544
8545    if (instr->coord_components > 1 &&
8546        instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
8547        instr->is_array &&
8548        instr->op != nir_texop_txf)
8549       coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]);
8550
8551    if (instr->coord_components > 2 &&
8552       (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
8553        instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
8554        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
8555        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
8556        instr->is_array &&
8557        instr->op != nir_texop_txf &&
8558        instr->op != nir_texop_txf_ms &&
8559        instr->op != nir_texop_fragment_fetch &&
8560        instr->op != nir_texop_fragment_mask_fetch)
8561       coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]);
8562
8563    if (ctx->options->chip_class == GFX9 &&
8564        instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
8565        instr->op != nir_texop_lod && instr->coord_components) {
8566       assert(coords.size() > 0 && coords.size() < 3);
8567
8568       coords.insert(std::next(coords.begin()), bld.copy(bld.def(v1), instr->op == nir_texop_txf ?
8569                                                                      Operand((uint32_t) 0) :
8570                                                                      Operand((uint32_t) 0x3f000000)));
8571    }
8572
8573    bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
8574
8575    if (instr->op == nir_texop_samples_identical)
8576       resource = fmask_ptr;
8577
8578    else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
8579              instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
8580             instr->op != nir_texop_txs &&
8581             instr->op != nir_texop_fragment_fetch &&
8582             instr->op != nir_texop_fragment_mask_fetch) {
8583       assert(has_sample_index);
8584       Operand op(sample_index);
8585       if (sample_index_cv)
8586          op = Operand(sample_index_cv->u32);
8587       sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
8588    }
8589
8590    if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
8591       for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) {
8592          Temp off = emit_extract_vector(ctx, offset, i, v1);
8593          coords[i] = bld.vadd32(bld.def(v1), coords[i], off);
8594       }
8595       has_offset = false;
8596    }
8597
8598    /* Build tex instruction */
8599    unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
8600    unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
8601                   ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
8602                   : 0;
8603    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
8604    Temp tmp_dst = dst;
8605
8606    /* gather4 selects the component by dmask and always returns vec4 */
8607    if (instr->op == nir_texop_tg4) {
8608       assert(instr->dest.ssa.num_components == 4);
8609       if (instr->is_shadow)
8610          dmask = 1;
8611       else
8612          dmask = 1 << instr->component;
8613       if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
8614          tmp_dst = bld.tmp(v4);
8615    } else if (instr->op == nir_texop_samples_identical) {
8616       tmp_dst = bld.tmp(v1);
8617    } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) {
8618       tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
8619    }
8620
8621    aco_ptr<MIMG_instruction> tex;
8622    if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
8623       if (!has_lod)
8624          lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
8625
8626       bool div_by_6 = instr->op == nir_texop_txs &&
8627                       instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
8628                       instr->is_array &&
8629                       (dmask & (1 << 2));
8630       if (tmp_dst.id() == dst.id() && div_by_6)
8631          tmp_dst = bld.tmp(tmp_dst.regClass());
8632
8633       tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
8634       tex->operands[0] = Operand(resource);
8635       tex->operands[1] = Operand(s4); /* no sampler */
8636       tex->operands[2] = Operand(as_vgpr(ctx,lod));
8637       if (ctx->options->chip_class == GFX9 &&
8638           instr->op == nir_texop_txs &&
8639           instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
8640           instr->is_array) {
8641          tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
8642       } else if (instr->op == nir_texop_query_levels) {
8643          tex->dmask = 1 << 3;
8644       } else {
8645          tex->dmask = dmask;
8646       }
8647       tex->da = da;
8648       tex->definitions[0] = Definition(tmp_dst);
8649       tex->dim = dim;
8650       tex->can_reorder = true;
8651       ctx->block->instructions.emplace_back(std::move(tex));
8652
8653       if (div_by_6) {
8654          /* divide 3rd value by 6 by multiplying with magic number */
8655          emit_split_vector(ctx, tmp_dst, tmp_dst.size());
8656          Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
8657          Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c);
8658          assert(instr->dest.ssa.num_components == 3);
8659          Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
8660          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
8661                               emit_extract_vector(ctx, tmp_dst, 0, v1),
8662                               emit_extract_vector(ctx, tmp_dst, 1, v1),
8663                               by_6);
8664
8665       }
8666
8667       expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
8668       return;
8669    }
8670
8671    Temp tg4_compare_cube_wa64 = Temp();
8672
8673    if (tg4_integer_workarounds) {
8674       tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 3, 1));
8675       tex->operands[0] = Operand(resource);
8676       tex->operands[1] = Operand(s4); /* no sampler */
8677       tex->operands[2] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
8678       tex->dim = dim;
8679       tex->dmask = 0x3;
8680       tex->da = da;
8681       Temp size = bld.tmp(v2);
8682       tex->definitions[0] = Definition(size);
8683       tex->can_reorder = true;
8684       ctx->block->instructions.emplace_back(std::move(tex));
8685       emit_split_vector(ctx, size, size.size());
8686
8687       Temp half_texel[2];
8688       for (unsigned i = 0; i < 2; i++) {
8689          half_texel[i] = emit_extract_vector(ctx, size, i, v1);
8690          half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
8691          half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
8692          half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
8693       }
8694
8695       Temp new_coords[2] = {
8696          bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]),
8697          bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])
8698       };
8699
8700       if (tg4_integer_cube_workaround) {
8701          // see comment in ac_nir_to_llvm.c's lower_gather4_integer()
8702          Temp desc[resource.size()];
8703          aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector,
8704                                                                            Format::PSEUDO, 1, resource.size())};
8705          split->operands[0] = Operand(resource);
8706          for (unsigned i = 0; i < resource.size(); i++) {
8707             desc[i] = bld.tmp(s1);
8708             split->definitions[i] = Definition(desc[i]);
8709          }
8710          ctx->block->instructions.emplace_back(std::move(split));
8711
8712          Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16)));
8713          Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
8714                                          Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8));
8715
8716          Temp nfmt;
8717          if (stype == GLSL_TYPE_UINT) {
8718             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
8719                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED),
8720                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT),
8721                             bld.scc(compare_cube_wa));
8722          } else {
8723             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
8724                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED),
8725                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
8726                             bld.scc(compare_cube_wa));
8727          }
8728          tg4_compare_cube_wa64 = bld.tmp(bld.lm);
8729          bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64);
8730
8731          nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
8732
8733          desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
8734                             Operand((uint32_t)C_008F14_NUM_FORMAT));
8735          desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
8736
8737          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
8738                                                                          Format::PSEUDO, resource.size(), 1)};
8739          for (unsigned i = 0; i < resource.size(); i++)
8740             vec->operands[i] = Operand(desc[i]);
8741          resource = bld.tmp(resource.regClass());
8742          vec->definitions[0] = Definition(resource);
8743          ctx->block->instructions.emplace_back(std::move(vec));
8744
8745          new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
8746                                   new_coords[0], coords[0], tg4_compare_cube_wa64);
8747          new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
8748                                   new_coords[1], coords[1], tg4_compare_cube_wa64);
8749       }
8750       coords[0] = new_coords[0];
8751       coords[1] = new_coords[1];
8752    }
8753
8754    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
8755       //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
8756
8757       assert(coords.size() == 1);
8758       unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
8759       aco_opcode op;
8760       switch (last_bit) {
8761       case 1:
8762          op = aco_opcode::buffer_load_format_x; break;
8763       case 2:
8764          op = aco_opcode::buffer_load_format_xy; break;
8765       case 3:
8766          op = aco_opcode::buffer_load_format_xyz; break;
8767       case 4:
8768          op = aco_opcode::buffer_load_format_xyzw; break;
8769       default:
8770          unreachable("Tex instruction loads more than 4 components.");
8771       }
8772
8773       /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */
8774       if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
8775          tmp_dst = dst;
8776       else
8777          tmp_dst = bld.tmp(RegType::vgpr, last_bit);
8778
8779       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
8780       mubuf->operands[0] = Operand(resource);
8781       mubuf->operands[1] = Operand(coords[0]);
8782       mubuf->operands[2] = Operand((uint32_t) 0);
8783       mubuf->definitions[0] = Definition(tmp_dst);
8784       mubuf->idxen = true;
8785       mubuf->can_reorder = true;
8786       ctx->block->instructions.emplace_back(std::move(mubuf));
8787
8788       expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
8789       return;
8790    }
8791
8792    /* gather MIMG address components */
8793    std::vector<Temp> args;
8794    if (has_offset)
8795       args.emplace_back(offset);
8796    if (has_bias)
8797       args.emplace_back(bias);
8798    if (has_compare)
8799       args.emplace_back(compare);
8800    if (has_derivs)
8801       args.insert(args.end(), derivs.begin(), derivs.end());
8802
8803    args.insert(args.end(), coords.begin(), coords.end());
8804    if (has_sample_index)
8805       args.emplace_back(sample_index);
8806    if (has_lod)
8807       args.emplace_back(lod);
8808    if (has_clamped_lod)
8809       args.emplace_back(clamped_lod);
8810
8811    Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size()));
8812    aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
8813    vec->definitions[0] = Definition(arg);
8814    for (unsigned i = 0; i < args.size(); i++)
8815       vec->operands[i] = Operand(args[i]);
8816    ctx->block->instructions.emplace_back(std::move(vec));
8817
8818
8819    if (instr->op == nir_texop_txf ||
8820        instr->op == nir_texop_txf_ms ||
8821        instr->op == nir_texop_samples_identical ||
8822        instr->op == nir_texop_fragment_fetch ||
8823        instr->op == nir_texop_fragment_mask_fetch) {
8824       aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
8825       tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 3, 1));
8826       tex->operands[0] = Operand(resource);
8827       tex->operands[1] = Operand(s4); /* no sampler */
8828       tex->operands[2] = Operand(arg);
8829       tex->dim = dim;
8830       tex->dmask = dmask;
8831       tex->unrm = true;
8832       tex->da = da;
8833       tex->definitions[0] = Definition(tmp_dst);
8834       tex->can_reorder = true;
8835       ctx->block->instructions.emplace_back(std::move(tex));
8836
8837       if (instr->op == nir_texop_samples_identical) {
8838          assert(dmask == 1 && dst.regClass() == v1);
8839          assert(dst.id() != tmp_dst.id());
8840
8841          Temp tmp = bld.tmp(bld.lm);
8842          bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
8843          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
8844
8845       } else {
8846          expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
8847       }
8848       return;
8849    }
8850
8851    // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
8852    aco_opcode opcode = aco_opcode::image_sample;
8853    if (has_offset) { /* image_sample_*_o */
8854       if (has_clamped_lod) {
8855          if (has_compare) {
8856             opcode = aco_opcode::image_sample_c_cl_o;
8857             if (has_derivs)
8858                opcode = aco_opcode::image_sample_c_d_cl_o;
8859             if (has_bias)
8860                opcode = aco_opcode::image_sample_c_b_cl_o;
8861          } else {
8862             opcode = aco_opcode::image_sample_cl_o;
8863             if (has_derivs)
8864                opcode = aco_opcode::image_sample_d_cl_o;
8865             if (has_bias)
8866                opcode = aco_opcode::image_sample_b_cl_o;
8867          }
8868       } else if (has_compare) {
8869          opcode = aco_opcode::image_sample_c_o;
8870          if (has_derivs)
8871             opcode = aco_opcode::image_sample_c_d_o;
8872          if (has_bias)
8873             opcode = aco_opcode::image_sample_c_b_o;
8874          if (level_zero)
8875             opcode = aco_opcode::image_sample_c_lz_o;
8876          if (has_lod)
8877             opcode = aco_opcode::image_sample_c_l_o;
8878       } else {
8879          opcode = aco_opcode::image_sample_o;
8880          if (has_derivs)
8881             opcode = aco_opcode::image_sample_d_o;
8882          if (has_bias)
8883             opcode = aco_opcode::image_sample_b_o;
8884          if (level_zero)
8885             opcode = aco_opcode::image_sample_lz_o;
8886          if (has_lod)
8887             opcode = aco_opcode::image_sample_l_o;
8888       }
8889    } else if (has_clamped_lod) { /* image_sample_*_cl */
8890       if (has_compare) {
8891          opcode = aco_opcode::image_sample_c_cl;
8892          if (has_derivs)
8893             opcode = aco_opcode::image_sample_c_d_cl;
8894          if (has_bias)
8895             opcode = aco_opcode::image_sample_c_b_cl;
8896       } else {
8897          opcode = aco_opcode::image_sample_cl;
8898          if (has_derivs)
8899             opcode = aco_opcode::image_sample_d_cl;
8900          if (has_bias)
8901             opcode = aco_opcode::image_sample_b_cl;
8902       }
8903    } else { /* no offset */
8904       if (has_compare) {
8905          opcode = aco_opcode::image_sample_c;
8906          if (has_derivs)
8907             opcode = aco_opcode::image_sample_c_d;
8908          if (has_bias)
8909             opcode = aco_opcode::image_sample_c_b;
8910          if (level_zero)
8911             opcode = aco_opcode::image_sample_c_lz;
8912          if (has_lod)
8913             opcode = aco_opcode::image_sample_c_l;
8914       } else {
8915          opcode = aco_opcode::image_sample;
8916          if (has_derivs)
8917             opcode = aco_opcode::image_sample_d;
8918          if (has_bias)
8919             opcode = aco_opcode::image_sample_b;
8920          if (level_zero)
8921             opcode = aco_opcode::image_sample_lz;
8922          if (has_lod)
8923             opcode = aco_opcode::image_sample_l;
8924       }
8925    }
8926
8927    if (instr->op == nir_texop_tg4) {
8928       if (has_offset) { /* image_gather4_*_o */
8929          if (has_compare) {
8930             opcode = aco_opcode::image_gather4_c_lz_o;
8931             if (has_lod)
8932                opcode = aco_opcode::image_gather4_c_l_o;
8933             if (has_bias)
8934                opcode = aco_opcode::image_gather4_c_b_o;
8935          } else {
8936             opcode = aco_opcode::image_gather4_lz_o;
8937             if (has_lod)
8938                opcode = aco_opcode::image_gather4_l_o;
8939             if (has_bias)
8940                opcode = aco_opcode::image_gather4_b_o;
8941          }
8942       } else {
8943          if (has_compare) {
8944             opcode = aco_opcode::image_gather4_c_lz;
8945             if (has_lod)
8946                opcode = aco_opcode::image_gather4_c_l;
8947             if (has_bias)
8948                opcode = aco_opcode::image_gather4_c_b;
8949          } else {
8950             opcode = aco_opcode::image_gather4_lz;
8951             if (has_lod)
8952                opcode = aco_opcode::image_gather4_l;
8953             if (has_bias)
8954                opcode = aco_opcode::image_gather4_b;
8955          }
8956       }
8957    } else if (instr->op == nir_texop_lod) {
8958       opcode = aco_opcode::image_get_lod;
8959    }
8960
8961    /* we don't need the bias, sample index, compare value or offset to be
8962     * computed in WQM but if the p_create_vector copies the coordinates, then it
8963     * needs to be in WQM */
8964    if (ctx->stage == fragment_fs &&
8965        !has_derivs && !has_lod && !level_zero &&
8966        instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
8967        instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
8968       arg = emit_wqm(ctx, arg, bld.tmp(arg.regClass()), true);
8969
8970    tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
8971    tex->operands[0] = Operand(resource);
8972    tex->operands[1] = Operand(sampler);
8973    tex->operands[2] = Operand(arg);
8974    tex->dim = dim;
8975    tex->dmask = dmask;
8976    tex->da = da;
8977    tex->definitions[0] = Definition(tmp_dst);
8978    tex->can_reorder = true;
8979    ctx->block->instructions.emplace_back(std::move(tex));
8980
8981    if (tg4_integer_cube_workaround) {
8982       assert(tmp_dst.id() != dst.id());
8983       assert(tmp_dst.size() == dst.size() && dst.size() == 4);
8984
8985       emit_split_vector(ctx, tmp_dst, tmp_dst.size());
8986       Temp val[4];
8987       for (unsigned i = 0; i < dst.size(); i++) {
8988          val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
8989          Temp cvt_val;
8990          if (stype == GLSL_TYPE_UINT)
8991             cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
8992          else
8993             cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
8994          val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64);
8995       }
8996       Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4);
8997       tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
8998                            val[0], val[1], val[2], val[3]);
8999    }
9000    unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask;
9001    expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
9002
9003 }
9004
9005
/* Builds the operand that a phi should use for the NIR value `ssa`.
 *
 * ctx:     instruction selection context, used to look up the temporary of
 *          `ssa` and the program's wave size.
 * ssa:     the NIR SSA value feeding the phi.
 * rc:      register class used when only a class-typed placeholder operand
 *          is returned (the callers in this file treat such operands as
 *          undefined).
 * logical: whether the phi being built is a logical phi.
 *
 * Returns Operand(rc) for sources produced by an ssa_undef instruction, a
 * constant operand for 1-bit load_const sources of logical phis, and the
 * temporary of `ssa` otherwise.
 */
9006 Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc, bool logical)
9007 {
        /* the temporary is looked up unconditionally, even on the paths below
         * that end up not using it */
9008    Temp tmp = get_ssa_temp(ctx, ssa);
9009    if (ssa->parent_instr->type == nir_instr_type_ssa_undef) {
9010       return Operand(rc);
9011    } else if (logical && ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) {
           /* 1-bit constant: expand the boolean to all-ones or zero, using a
            * 64-bit constant for wave64 and a 32-bit constant otherwise */
9012       if (ctx->program->wave_size == 64)
9013          return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX : 0u);
9014       else
9015          return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX : 0u);
9016    } else {
9017       return Operand(tmp);
9018    }
9019 }
9020
/* Translates a NIR phi instruction into p_phi / p_linear_phi instruction(s)
 * for the current block (ctx->block).
 *
 * The NIR sources are matched against the block's predecessor list (logical
 * or linear, depending on the kind of phi) and missing entries are filled
 * with class-typed placeholder operands. Depending on the result, the
 * function then either
 *  - defines the destination as zero without emitting a phi (no defined
 *    source at all),
 *  - emits a two-operand p_linear_phi (linear destination in a merge block
 *    with exactly one defined source),
 *  - emits one phi per vector component plus a p_create_vector, or
 *  - emits a single phi with all collected operands.
 */
9021 void visit_phi(isel_context *ctx, nir_phi_instr *instr)
9022 {
9023    aco_ptr<Pseudo_instruction> phi;
9024    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
9025    assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
9026
        /* use a logical phi if the destination is not linear, the NIR
         * destination is divergent, or the current block is a merge block */
9027    bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest);
9028    logical |= ctx->block->kind & block_kind_merge;
9029    aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
9030
9031    /* we want a sorted list of sources, since the predecessor list is also sorted */
        /* keyed by the index of the NIR predecessor block; std::map iterates
         * in ascending key order */
9032    std::map<unsigned, nir_ssa_def*> phi_src;
9033    nir_foreach_phi_src(src, instr)
9034       phi_src[src->pred->index] = src->src.ssa;
9035
9036    std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds;
9037    unsigned num_operands = 0;
        /* the "+ 1" leaves room for the extra continue-edge operand that may
         * be appended for loop header phis further down.
         * NOTE(review): the array bound is a runtime value, i.e. this is a
         * variable-length array, which is a compiler extension and not
         * standard C++. */
9038    Operand operands[std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1];
        /* number of collected operands that are not undefined */
9039    unsigned num_defined = 0;
9040    unsigned cur_pred_idx = 0;
9041    for (std::pair<unsigned, nir_ssa_def *> src : phi_src) {
9042       if (cur_pred_idx < preds.size()) {
9043          /* handle missing preds (IF merges with discard/break) and extra preds (loop exit with discard) */
9044          unsigned block = ctx->cf_info.nir_to_aco[src.first];
              /* count the predecessors that come before the one belonging to
               * this source */
9045          unsigned skipped = 0;
9046          while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block)
9047             skipped++;
9048          if (cur_pred_idx + skipped < preds.size()) {
                 /* found: the skipped predecessors get placeholder operands */
9049             for (unsigned i = 0; i < skipped; i++)
9050                operands[num_operands++] = Operand(dst.regClass());
9051             cur_pred_idx += skipped;
9052          } else {
                 /* the block of this source is not among the remaining
                  * predecessors: drop the source */
9053             continue;
9054          }
9055       }
9056       /* Handle missing predecessors at the end. This shouldn't happen with loop
9057        * headers and we can't ignore these sources for loop header phis. */
9058       if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
9059          continue;
9060       cur_pred_idx++;
9061       Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
9062       operands[num_operands++] = op;
9063       num_defined += !op.isUndefined();
9064    }
9065    /* handle block_kind_continue_or_break at loop exit blocks */
9066    while (cur_pred_idx++ < preds.size())
9067       operands[num_operands++] = Operand(dst.regClass());
9068
9069    /* If the loop ends with a break, still add a linear continue edge in case
9070     * that break is divergent or continue_or_break is used. We'll either remove
9071     * this operand later in visit_loop() if it's not necessary or replace the
9072     * undef with something correct. */
9073    if (!logical && ctx->block->kind & block_kind_loop_header) {
9074       nir_loop *loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent);
9075       nir_block *last = nir_loop_last_block(loop);
9076       if (last->successors[0] != instr->instr.block)
9077          operands[num_operands++] = Operand(RegClass());
9078    }
9079
        /* no source has a defined value: emit no phi at all and define the
         * destination as zero instead */
9080    if (num_defined == 0) {
9081       Builder bld(ctx->program, ctx->block);
9082       if (dst.regClass() == s1) {
9083          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
9084       } else if (dst.regClass() == v1) {
9085          bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
9086       } else {
9087          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9088          for (unsigned i = 0; i < dst.size(); i++)
9089             vec->operands[i] = Operand(0u);
9090          vec->definitions[0] = Definition(dst);
9091          ctx->block->instructions.emplace_back(std::move(vec));
9092       }
9093       return;
9094    }
9095
9096    /* we can use a linear phi in some cases if one src is undef */
9097    if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) {
           /* NOTE(review): the phi is created with num_operands operands but
            * only operands 0 and 1 are written below, so this presumably
            * relies on num_operands being 2 here - confirm */
9098       phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, num_operands, 1));
9099
           /* walk back from this merge block: its second linear predecessor,
            * then that block's first linear predecessor, which must be the
            * invert block (asserted) */
9100       Block *linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]];
9101       Block *invert = &ctx->program->blocks[linear_else->linear_preds[0]];
9102       assert(invert->kind & block_kind_invert);
9103
9104       unsigned then_block = invert->linear_preds[0];
9105
           /* find the single defined operand; the phi goes into the invert
            * block if that operand belongs to then_block and into the current
            * block otherwise */
9106       Block* insert_block = NULL;
9107       for (unsigned i = 0; i < num_operands; i++) {
9108          Operand op = operands[i];
9109          if (op.isUndefined())
9110             continue;
9111          insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block;
9112          phi->operands[0] = op;
9113          break;
9114       }
9115       assert(insert_block); /* should be handled by the "num_defined == 0" case above */
9116       phi->operands[1] = Operand(dst.regClass());
9117       phi->definitions[0] = Definition(dst);
9118       insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi));
9119       return;
9120    }
9121
9122    /* try to scalarize vector phis */
9123    if (instr->dest.ssa.bit_size != 1 && dst.size() > 1) {
9124       // TODO: scalarize linear phis on divergent ifs
9125       bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
9126       std::array<Temp, NIR_MAX_VEC_COMPONENTS> new_vec;
           /* every temporary operand needs an entry in ctx->allocated_vec so
            * that its components can be looked up below */
9127       for (unsigned i = 0; can_scalarize && (i < num_operands); i++) {
9128          Operand src = operands[i];
9129          if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end())
9130             can_scalarize = false;
9131       }
9132       if (can_scalarize) {
9133          unsigned num_components = instr->dest.ssa.num_components;
9134          assert(dst.size() % num_components == 0);
              /* register class of a single component */
9135          RegClass rc = RegClass(dst.type(), dst.size() / num_components);
9136
9137          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
              /* one phi per component, each inserted at the start of the block;
               * operands that are not temporaries become placeholder operands
               * of the component class */
9138          for (unsigned k = 0; k < num_components; k++) {
9139             phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
9140             for (unsigned i = 0; i < num_operands; i++) {
9141                Operand src = operands[i];
9142                phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc);
9143             }
9144             Temp phi_dst = {ctx->program->allocateId(), rc};
9145             phi->definitions[0] = Definition(phi_dst);
9146             ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9147             new_vec[k] = phi_dst;
9148             vec->operands[k] = Operand(phi_dst);
9149          }
              /* reassemble the destination from the per-component phis and
               * record its components in ctx->allocated_vec */
9150          vec->definitions[0] = Definition(dst);
9151          ctx->block->instructions.emplace_back(std::move(vec));
9152          ctx->allocated_vec.emplace(dst.id(), new_vec);
9153          return;
9154       }
9155    }
9156
        /* general case: a single phi with all collected operands, inserted at
         * the start of the block */
9157    phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1));
9158    for (unsigned i = 0; i < num_operands; i++)
9159       phi->operands[i] = operands[i];
9160    phi->definitions[0] = Definition(dst);
9161    ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
9162 }
9163
9164
/* Emits the definition for a NIR ssa_undef instruction.
 *
 * The temporary assigned to the undef value must be an SGPR (asserted). It is
 * defined as zero: with a single copy if its size is 1, otherwise with a
 * p_create_vector that has dst.size() zero operands.
 */
9165 void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr)
9166 {
9167    Temp dst = get_ssa_temp(ctx, &instr->def);
9168
9169    assert(dst.type() == RegType::sgpr);
9170
9171    if (dst.size() == 1) {
9172       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u));
9173    } else {
9174       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
9175       for (unsigned i = 0; i < dst.size(); i++)
9176          vec->operands[i] = Operand(0u);
9177       vec->definitions[0] = Definition(dst);
9178       ctx->block->instructions.emplace_back(std::move(vec));
9179    }
9180 }
9181
/* Handles a NIR jump instruction (break or continue) inside a loop.
 *
 * The current block always gets a logical edge to the jump target (the loop
 * exit for break, the loop header for continue). If the jump is uniform, the
 * block simply branches to the target and the function returns. Otherwise two
 * new blocks are created: one that forwards to the jump target along the
 * linear CFG and one in which code generation continues (ctx->block is set to
 * it).
 *
 * Any other jump type is reported on stderr and aborts.
 */
9182 void visit_jump(isel_context *ctx, nir_jump_instr *instr)
9183 {
9184    Builder bld(ctx->program, ctx->block);
9185    Block *logical_target;
9186    append_logical_end(ctx->block);
9187    unsigned idx = ctx->block->index;
9188
9189    switch (instr->type) {
9190    case nir_jump_break:
9191       logical_target = ctx->cf_info.parent_loop.exit;
9192       add_logical_edge(idx, logical_target);
9193       ctx->block->kind |= block_kind_break;
9194
           /* the break is treated as uniform only if the enclosing if is not
            * divergent and the loop has had no divergent continue so far */
9195       if (!ctx->cf_info.parent_if.is_divergent &&
9196           !ctx->cf_info.parent_loop.has_divergent_continue) {
9197          /* uniform break - directly jump out of the loop */
9198          ctx->block->kind |= block_kind_uniform;
9199          ctx->cf_info.has_branch = true;
9200          bld.branch(aco_opcode::p_branch);
9201          add_linear_edge(idx, logical_target);
9202          return;
9203       }
9204       ctx->cf_info.parent_loop.has_divergent_branch = true;
           /* record the block that ends with this jump as the ACO block of the
            * NIR block; it is set here because ctx->block is replaced below */
9205       ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
9206       break;
9207    case nir_jump_continue:
9208       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
9209       add_logical_edge(idx, logical_target);
9210       ctx->block->kind |= block_kind_continue;
9211
9212       if (ctx->cf_info.parent_if.is_divergent) {
9213          /* for potential uniform breaks after this continue,
9214             we must ensure that they are handled correctly */
9215          ctx->cf_info.parent_loop.has_divergent_continue = true;
9216          ctx->cf_info.parent_loop.has_divergent_branch = true;
9217          ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;
9218       } else {
9219          /* uniform continue - directly jump to the loop header */
9220          ctx->block->kind |= block_kind_uniform;
9221          ctx->cf_info.has_branch = true;
9222          bld.branch(aco_opcode::p_branch);
9223          add_linear_edge(idx, logical_target);
9224          return;
9225       }
9226       break;
9227    default:
9228       fprintf(stderr, "Unknown NIR jump instr: ");
9229       nir_print_instr(&instr->instr, stderr);
9230       fprintf(stderr, "\n");
9231       abort();
9232    }
9233
        /* only the non-uniform cases reach this point. Inside a divergent if,
         * note that exec may be empty after this jump, and at which loop nest
         * depth, unless that is already recorded. */
9234    if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) {
9235       ctx->cf_info.exec_potentially_empty_break = true;
9236       ctx->cf_info.exec_potentially_empty_break_depth = ctx->cf_info.loop_nest_depth;
9237    }
9238
9239    /* remove critical edges from linear CFG */
        /* the current block branches to a new uniform block whose only job is
         * to branch on to the jump target */
9240    bld.branch(aco_opcode::p_branch);
9241    Block* break_block = ctx->program->create_and_insert_block();
9242    break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9243    break_block->kind |= block_kind_uniform;
9244    add_linear_edge(idx, break_block);
9245    /* the loop_header pointer might be invalidated by this point */
9246    if (instr->type == nir_jump_continue)
9247       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
9248    add_linear_edge(break_block->index, logical_target);
9249    bld.reset(break_block);
9250    bld.branch(aco_opcode::p_branch);
9251
        /* second linear successor of the jumping block: code generation
         * continues in this new block */
9252    Block* continue_block = ctx->program->create_and_insert_block();
9253    continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9254    add_linear_edge(idx, continue_block);
9255    append_logical_start(continue_block);
9256    ctx->block = continue_block;
9257    return;
9258 }
9259
/* Selects instructions for every NIR instruction of `block`, in order, by
 * dispatching on the instruction type to the matching visit_* function.
 *
 * Afterwards the NIR block index is mapped to the index of the current ACO
 * block in ctx->cf_info.nir_to_aco, unless the parent loop has a divergent
 * branch.
 */
9260 void visit_block(isel_context *ctx, nir_block *block)
9261 {
9262    nir_foreach_instr(instr, block) {
9263       switch (instr->type) {
9264       case nir_instr_type_alu:
9265          visit_alu_instr(ctx, nir_instr_as_alu(instr));
9266          break;
9267       case nir_instr_type_load_const:
9268          visit_load_const(ctx, nir_instr_as_load_const(instr));
9269          break;
9270       case nir_instr_type_intrinsic:
9271          visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
9272          break;
9273       case nir_instr_type_tex:
9274          visit_tex(ctx, nir_instr_as_tex(instr));
9275          break;
9276       case nir_instr_type_phi:
9277          visit_phi(ctx, nir_instr_as_phi(instr));
9278          break;
9279       case nir_instr_type_ssa_undef:
9280          visit_undef(ctx, nir_instr_as_ssa_undef(instr));
9281          break;
           /* deref instructions emit nothing here */
9282       case nir_instr_type_deref:
9283          break;
9284       case nir_instr_type_jump:
9285          visit_jump(ctx, nir_instr_as_jump(instr));
9286          break;
           /* any other instruction type is only reported on stderr; since the
            * abort() below is commented out, selection continues with the next
            * instruction */
9287       default:
9288          fprintf(stderr, "Unknown NIR instr type: ");
9289          nir_print_instr(instr, stderr);
9290          fprintf(stderr, "\n");
9291          //abort();
9292       }
9293    }
9294
        /* visit_jump() writes this mapping itself when it sets
         * has_divergent_branch, so it is not overwritten in that case */
9295    if (!ctx->cf_info.parent_loop.has_divergent_branch)
9296       ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
9297 }
9298
9299
9300
/* Computes, for every block with index in [first, last], the operand that
 * holds the value of `header_phi` in that block, inserting p_linear_phi
 * instructions where linear predecessors disagree.
 *
 * ctx:        instruction selection context (provides the program's blocks
 *             and new temporary ids).
 * first/last: inclusive range of block indices to process.
 * header_phi: the phi whose definition is the value in block `first` and
 *             whose operands from index 1 onwards are consumed, in order, by
 *             the blocks of kind block_kind_continue.
 * vals:       output array indexed by (block index - first); it is written
 *             for every index in the range, so it needs at least
 *             last - first + 1 entries.
 *
 * Returns the operand computed for block `last`.
 */
9301 static Operand create_continue_phis(isel_context *ctx, unsigned first, unsigned last,
9302                                     aco_ptr<Instruction>& header_phi, Operand *vals)
9303 {
9304    vals[0] = Operand(header_phi->definitions[0].getTemp());
9305    RegClass rc = vals[0].regClass();
9306
9307    unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;
9308
        /* index of the next header_phi operand to hand to a continue block */
9309    unsigned next_pred = 1;
9310
9311    for (unsigned idx = first + 1; idx <= last; idx++) {
9312       Block& block = ctx->program->blocks[idx];
           /* blocks at a different loop nest depth than `first` just inherit
            * the value of the block with the preceding index */
9313       if (block.loop_nest_depth != loop_nest_depth) {
9314          vals[idx - first] = vals[idx - 1 - first];
9315          continue;
9316       }
9317
           /* continue blocks take the next operand of the header phi */
9318       if (block.kind & block_kind_continue) {
9319          vals[idx - first] = header_phi->operands[next_pred];
9320          next_pred++;
9321          continue;
9322       }
9323
           /* NOTE(review): the vals[pred - first] lookups below assume every
            * linear predecessor has an index in [first, idx) - confirm */
9324       bool all_same = true;
9325       for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
9326          all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];
9327
9328       Operand val;
9329       if (all_same) {
              /* all linear predecessors agree: reuse their value */
9330          val = vals[block.linear_preds[0] - first];
9331       } else {
              /* otherwise merge the predecessors' values with a new linear phi
               * at the start of the block */
9332          aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
9333             aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
9334          for (unsigned i = 0; i < block.linear_preds.size(); i++)
9335             phi->operands[i] = vals[block.linear_preds[i] - first];
9336          val = Operand(Temp(ctx->program->allocateId(), rc));
9337          phi->definitions[0] = Definition(val.getTemp());
9338          block.instructions.emplace(block.instructions.begin(), std::move(phi));
9339       }
9340       vals[idx - first] = val;
9341    }
9342
9343    return vals[last - first];
9344 }
9345
9346 static void visit_loop(isel_context *ctx, nir_loop *loop)
9347 {
9348    //TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true
9349    append_logical_end(ctx->block);
9350    ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
9351    Builder bld(ctx->program, ctx->block);
9352    bld.branch(aco_opcode::p_branch);
9353    unsigned loop_preheader_idx = ctx->block->index;
9354
9355    Block loop_exit = Block();
9356    loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9357    loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
9358
9359    Block* loop_header = ctx->program->create_and_insert_block();
9360    loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
9361    loop_header->kind |= block_kind_loop_header;
9362    add_edge(loop_preheader_idx, loop_header);
9363    ctx->block = loop_header;
9364
9365    /* emit loop body */
9366    unsigned loop_header_idx = loop_header->index;
9367    loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
9368    append_logical_start(ctx->block);
9369    bool unreachable = visit_cf_list(ctx, &loop->body);
9370
9371    //TODO: what if a loop ends with a unconditional or uniformly branched continue and this branch is never taken?
9372    if (!ctx->cf_info.has_branch) {
9373       append_logical_end(ctx->block);
9374       if (ctx->cf_info.exec_potentially_empty_discard || ctx->cf_info.exec_potentially_empty_break) {
9375          /* Discards can result in code running with an empty exec mask.
9376           * This would result in divergent breaks not ever being taken. As a
9377           * workaround, break the loop when the loop mask is empty instead of
9378           * always continuing. */
9379          ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
9380          unsigned block_idx = ctx->block->index;
9381
9382          /* create helper blocks to avoid critical edges */
9383          Block *break_block = ctx->program->create_and_insert_block();
9384          break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9385          break_block->kind = block_kind_uniform;
9386          bld.reset(break_block);
9387          bld.branch(aco_opcode::p_branch);
9388          add_linear_edge(block_idx, break_block);
9389          add_linear_edge(break_block->index, &loop_exit);
9390
9391          Block *continue_block = ctx->program->create_and_insert_block();
9392          continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9393          continue_block->kind = block_kind_uniform;
9394          bld.reset(continue_block);
9395          bld.branch(aco_opcode::p_branch);
9396          add_linear_edge(block_idx, continue_block);
9397          add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]);
9398
9399          if (!ctx->cf_info.parent_loop.has_divergent_branch)
9400             add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]);
9401          ctx->block = &ctx->program->blocks[block_idx];
9402       } else {
9403          ctx->block->kind |= (block_kind_continue | block_kind_uniform);
9404          if (!ctx->cf_info.parent_loop.has_divergent_branch)
9405             add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9406          else
9407             add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
9408       }
9409
9410       bld.reset(ctx->block);
9411       bld.branch(aco_opcode::p_branch);
9412    }
9413
9414    /* Fixup phis in loop header from unreachable blocks.
9415     * has_branch/has_divergent_branch also indicates if the loop ends with a
9416     * break/continue instruction, but we don't emit those if unreachable=true */
9417    if (unreachable) {
9418       assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch);
9419       bool linear = ctx->cf_info.has_branch;
9420       bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
9421       for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
9422          if ((logical && instr->opcode == aco_opcode::p_phi) ||
9423              (linear && instr->opcode == aco_opcode::p_linear_phi)) {
9424             /* the last operand should be the one that needs to be removed */
9425             instr->operands.pop_back();
9426          } else if (!is_phi(instr)) {
9427             break;
9428          }
9429       }
9430    }
9431
9432    /* Fixup linear phis in loop header from expecting a continue. Both this fixup
9433     * and the previous one shouldn't both happen at once because a break in the
9434     * merge block would get CSE'd */
9435    if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) {
9436       unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1);
9437       Operand vals[num_vals];
9438       for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
9439          if (instr->opcode == aco_opcode::p_linear_phi) {
9440             if (ctx->cf_info.has_branch)
9441                instr->operands.pop_back();
9442             else
9443                instr->operands.back() = create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals);
9444          } else if (!is_phi(instr)) {
9445             break;
9446          }
9447       }
9448    }
9449
9450    ctx->cf_info.has_branch = false;
9451
9452    // TODO: if the loop has not a single exit, we must add one °°
9453    /* emit loop successor block */
9454    ctx->block = ctx->program->insert_block(std::move(loop_exit));
9455    append_logical_start(ctx->block);
9456
9457    #if 0
9458    // TODO: check if it is beneficial to not branch on continues
9459    /* trim linear phis in loop header */
9460    for (auto&& instr : loop_entry->instructions) {
9461       if (instr->opcode == aco_opcode::p_linear_phi) {
9462          aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
9463          new_phi->definitions[0] = instr->definitions[0];
9464          for (unsigned i = 0; i < new_phi->operands.size(); i++)
9465             new_phi->operands[i] = instr->operands[i];
9466          /* check that the remaining operands are all the same */
9467          for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
9468             assert(instr->operands[i].tempId() == instr->operands.back().tempId());
9469          instr.swap(new_phi);
9470       } else if (instr->opcode == aco_opcode::p_phi) {
9471          continue;
9472       } else {
9473          break;
9474       }
9475    }
9476    #endif
9477 }
9478
9479 static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond)
9480 {
9481    ic->cond = cond;
9482
9483    append_logical_end(ctx->block);
9484    ctx->block->kind |= block_kind_branch;
9485
9486    /* branch to linear then block */
9487    assert(cond.regClass() == ctx->program->lane_mask);
9488    aco_ptr<Pseudo_branch_instruction> branch;
9489    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
9490    branch->operands[0] = Operand(cond);
9491    ctx->block->instructions.push_back(std::move(branch));
9492
9493    ic->BB_if_idx = ctx->block->index;
9494    ic->BB_invert = Block();
9495    ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9496    /* Invert blocks are intentionally not marked as top level because they
9497     * are not part of the logical cfg. */
9498    ic->BB_invert.kind |= block_kind_invert;
9499    ic->BB_endif = Block();
9500    ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9501    ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
9502
9503    ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard;
9504    ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break;
9505    ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth;
9506    ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
9507    ctx->cf_info.parent_if.is_divergent = true;
9508
9509    /* divergent branches use cbranch_execz */
9510    ctx->cf_info.exec_potentially_empty_discard = false;
9511    ctx->cf_info.exec_potentially_empty_break = false;
9512    ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9513
9514    /** emit logical then block */
9515    Block* BB_then_logical = ctx->program->create_and_insert_block();
9516    BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9517    add_edge(ic->BB_if_idx, BB_then_logical);
9518    ctx->block = BB_then_logical;
9519    append_logical_start(BB_then_logical);
9520 }
9521
9522 static void begin_divergent_if_else(isel_context *ctx, if_context *ic)
9523 {
9524    Block *BB_then_logical = ctx->block;
9525    append_logical_end(BB_then_logical);
9526     /* branch from logical then block to invert block */
9527    aco_ptr<Pseudo_branch_instruction> branch;
9528    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9529    BB_then_logical->instructions.emplace_back(std::move(branch));
9530    add_linear_edge(BB_then_logical->index, &ic->BB_invert);
9531    if (!ctx->cf_info.parent_loop.has_divergent_branch)
9532       add_logical_edge(BB_then_logical->index, &ic->BB_endif);
9533    BB_then_logical->kind |= block_kind_uniform;
9534    assert(!ctx->cf_info.has_branch);
9535    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
9536    ctx->cf_info.parent_loop.has_divergent_branch = false;
9537
9538    /** emit linear then block */
9539    Block* BB_then_linear = ctx->program->create_and_insert_block();
9540    BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9541    BB_then_linear->kind |= block_kind_uniform;
9542    add_linear_edge(ic->BB_if_idx, BB_then_linear);
9543    /* branch from linear then block to invert block */
9544    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9545    BB_then_linear->instructions.emplace_back(std::move(branch));
9546    add_linear_edge(BB_then_linear->index, &ic->BB_invert);
9547
9548    /** emit invert merge block */
9549    ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
9550    ic->invert_idx = ctx->block->index;
9551
9552    /* branch to linear else block (skip else) */
9553    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0));
9554    branch->operands[0] = Operand(ic->cond);
9555    ctx->block->instructions.push_back(std::move(branch));
9556
9557    ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard;
9558    ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break;
9559    ic->exec_potentially_empty_break_depth_old =
9560       std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
9561    /* divergent branches use cbranch_execz */
9562    ctx->cf_info.exec_potentially_empty_discard = false;
9563    ctx->cf_info.exec_potentially_empty_break = false;
9564    ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9565
9566    /** emit logical else block */
9567    Block* BB_else_logical = ctx->program->create_and_insert_block();
9568    BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9569    add_logical_edge(ic->BB_if_idx, BB_else_logical);
9570    add_linear_edge(ic->invert_idx, BB_else_logical);
9571    ctx->block = BB_else_logical;
9572    append_logical_start(BB_else_logical);
9573 }
9574
9575 static void end_divergent_if(isel_context *ctx, if_context *ic)
9576 {
9577    Block *BB_else_logical = ctx->block;
9578    append_logical_end(BB_else_logical);
9579
9580    /* branch from logical else block to endif block */
9581    aco_ptr<Pseudo_branch_instruction> branch;
9582    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9583    BB_else_logical->instructions.emplace_back(std::move(branch));
9584    add_linear_edge(BB_else_logical->index, &ic->BB_endif);
9585    if (!ctx->cf_info.parent_loop.has_divergent_branch)
9586       add_logical_edge(BB_else_logical->index, &ic->BB_endif);
9587    BB_else_logical->kind |= block_kind_uniform;
9588
9589    assert(!ctx->cf_info.has_branch);
9590    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
9591
9592
9593    /** emit linear else block */
9594    Block* BB_else_linear = ctx->program->create_and_insert_block();
9595    BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9596    BB_else_linear->kind |= block_kind_uniform;
9597    add_linear_edge(ic->invert_idx, BB_else_linear);
9598
9599    /* branch from linear else block to endif block */
9600    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9601    BB_else_linear->instructions.emplace_back(std::move(branch));
9602    add_linear_edge(BB_else_linear->index, &ic->BB_endif);
9603
9604
9605    /** emit endif merge block */
9606    ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
9607    append_logical_start(ctx->block);
9608
9609
9610    ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
9611    ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old;
9612    ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old;
9613    ctx->cf_info.exec_potentially_empty_break_depth =
9614       std::min(ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth);
9615    if (ctx->cf_info.loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth &&
9616        !ctx->cf_info.parent_if.is_divergent) {
9617       ctx->cf_info.exec_potentially_empty_break = false;
9618       ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9619    }
9620    /* uniform control flow never has an empty exec-mask */
9621    if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) {
9622       ctx->cf_info.exec_potentially_empty_discard = false;
9623       ctx->cf_info.exec_potentially_empty_break = false;
9624       ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
9625    }
9626 }
9627
9628 static void begin_uniform_if_then(isel_context *ctx, if_context *ic, Temp cond)
9629 {
9630    assert(cond.regClass() == s1);
9631
9632    append_logical_end(ctx->block);
9633    ctx->block->kind |= block_kind_uniform;
9634
9635    aco_ptr<Pseudo_branch_instruction> branch;
9636    aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
9637    branch.reset(create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 0));
9638    branch->operands[0] = Operand(cond);
9639    branch->operands[0].setFixed(scc);
9640    ctx->block->instructions.emplace_back(std::move(branch));
9641
9642    ic->BB_if_idx = ctx->block->index;
9643    ic->BB_endif = Block();
9644    ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
9645    ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;
9646
9647    ctx->cf_info.has_branch = false;
9648    ctx->cf_info.parent_loop.has_divergent_branch = false;
9649
9650    /** emit then block */
9651    Block* BB_then = ctx->program->create_and_insert_block();
9652    BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9653    add_edge(ic->BB_if_idx, BB_then);
9654    append_logical_start(BB_then);
9655    ctx->block = BB_then;
9656 }
9657
9658 static void begin_uniform_if_else(isel_context *ctx, if_context *ic)
9659 {
9660    Block *BB_then = ctx->block;
9661
9662    ic->uniform_has_then_branch = ctx->cf_info.has_branch;
9663    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
9664
9665    if (!ic->uniform_has_then_branch) {
9666       append_logical_end(BB_then);
9667       /* branch from then block to endif block */
9668       aco_ptr<Pseudo_branch_instruction> branch;
9669       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9670       BB_then->instructions.emplace_back(std::move(branch));
9671       add_linear_edge(BB_then->index, &ic->BB_endif);
9672       if (!ic->then_branch_divergent)
9673          add_logical_edge(BB_then->index, &ic->BB_endif);
9674       BB_then->kind |= block_kind_uniform;
9675    }
9676
9677    ctx->cf_info.has_branch = false;
9678    ctx->cf_info.parent_loop.has_divergent_branch = false;
9679
9680    /** emit else block */
9681    Block* BB_else = ctx->program->create_and_insert_block();
9682    BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
9683    add_edge(ic->BB_if_idx, BB_else);
9684    append_logical_start(BB_else);
9685    ctx->block = BB_else;
9686 }
9687
9688 static void end_uniform_if(isel_context *ctx, if_context *ic)
9689 {
9690    Block *BB_else = ctx->block;
9691
9692    if (!ctx->cf_info.has_branch) {
9693       append_logical_end(BB_else);
9694       /* branch from then block to endif block */
9695       aco_ptr<Pseudo_branch_instruction> branch;
9696       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
9697       BB_else->instructions.emplace_back(std::move(branch));
9698       add_linear_edge(BB_else->index, &ic->BB_endif);
9699       if (!ctx->cf_info.parent_loop.has_divergent_branch)
9700          add_logical_edge(BB_else->index, &ic->BB_endif);
9701       BB_else->kind |= block_kind_uniform;
9702    }
9703
9704    ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
9705    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
9706
9707    /** emit endif merge block */
9708    if (!ctx->cf_info.has_branch) {
9709       ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
9710       append_logical_start(ctx->block);
9711    }
9712 }
9713
9714 static bool visit_if(isel_context *ctx, nir_if *if_stmt)
9715 {
9716    Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
9717    Builder bld(ctx->program, ctx->block);
9718    aco_ptr<Pseudo_branch_instruction> branch;
9719    if_context ic;
9720
9721    if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
9722       /**
9723        * Uniform conditionals are represented in the following way*) :
9724        *
9725        * The linear and logical CFG:
9726        *                        BB_IF
9727        *                        /    \
9728        *       BB_THEN (logical)      BB_ELSE (logical)
9729        *                        \    /
9730        *                        BB_ENDIF
9731        *
9732        * *) Exceptions may be due to break and continue statements within loops
9733        *    If a break/continue happens within uniform control flow, it branches
9734        *    to the loop exit/entry block. Otherwise, it branches to the next
9735        *    merge block.
9736        **/
9737
9738       // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
9739       assert(cond.regClass() == ctx->program->lane_mask);
9740       cond = bool_to_scalar_condition(ctx, cond);
9741
9742       begin_uniform_if_then(ctx, &ic, cond);
9743       visit_cf_list(ctx, &if_stmt->then_list);
9744
9745       begin_uniform_if_else(ctx, &ic);
9746       visit_cf_list(ctx, &if_stmt->else_list);
9747
9748       end_uniform_if(ctx, &ic);
9749    } else { /* non-uniform condition */
9750       /**
9751        * To maintain a logical and linear CFG without critical edges,
9752        * non-uniform conditionals are represented in the following way*) :
9753        *
9754        * The linear CFG:
9755        *                        BB_IF
9756        *                        /    \
9757        *       BB_THEN (logical)      BB_THEN (linear)
9758        *                        \    /
9759        *                        BB_INVERT (linear)
9760        *                        /    \
9761        *       BB_ELSE (logical)      BB_ELSE (linear)
9762        *                        \    /
9763        *                        BB_ENDIF
9764        *
9765        * The logical CFG:
9766        *                        BB_IF
9767        *                        /    \
9768        *       BB_THEN (logical)      BB_ELSE (logical)
9769        *                        \    /
9770        *                        BB_ENDIF
9771        *
9772        * *) Exceptions may be due to break and continue statements within loops
9773        **/
9774
9775       begin_divergent_if_then(ctx, &ic, cond);
9776       visit_cf_list(ctx, &if_stmt->then_list);
9777
9778       begin_divergent_if_else(ctx, &ic);
9779       visit_cf_list(ctx, &if_stmt->else_list);
9780
9781       end_divergent_if(ctx, &ic);
9782    }
9783
9784    return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
9785 }
9786
9787 static bool visit_cf_list(isel_context *ctx,
9788                           struct exec_list *list)
9789 {
9790    foreach_list_typed(nir_cf_node, node, node, list) {
9791       switch (node->type) {
9792       case nir_cf_node_block:
9793          visit_block(ctx, nir_cf_node_as_block(node));
9794          break;
9795       case nir_cf_node_if:
9796          if (!visit_if(ctx, nir_cf_node_as_if(node)))
9797             return true;
9798          break;
9799       case nir_cf_node_loop:
9800          visit_loop(ctx, nir_cf_node_as_loop(node));
9801          break;
9802       default:
9803          unreachable("unimplemented cf list type");
9804       }
9805    }
9806    return false;
9807 }
9808
9809 static void create_null_export(isel_context *ctx)
9810 {
9811    /* Some shader stages always need to have exports.
9812     * So when there is none, we need to add a null export.
9813     */
9814
9815    unsigned dest = (ctx->program->stage & hw_fs) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
9816    bool vm = (ctx->program->stage & hw_fs) || ctx->program->chip_class >= GFX10;
9817    Builder bld(ctx->program, ctx->block);
9818    bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
9819            /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, vm);
9820 }
9821
9822 static bool export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
9823 {
9824    assert(ctx->stage == vertex_vs ||
9825           ctx->stage == tess_eval_vs ||
9826           ctx->stage == gs_copy_vs ||
9827           ctx->stage == ngg_vertex_gs ||
9828           ctx->stage == ngg_tess_eval_gs);
9829
9830    int offset = (ctx->stage & sw_tes)
9831                 ? ctx->program->info->tes.outinfo.vs_output_param_offset[slot]
9832                 : ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
9833    uint64_t mask = ctx->outputs.mask[slot];
9834    if (!is_pos && !mask)
9835       return false;
9836    if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
9837       return false;
9838    aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
9839    exp->enabled_mask = mask;
9840    for (unsigned i = 0; i < 4; ++i) {
9841       if (mask & (1 << i))
9842          exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
9843       else
9844          exp->operands[i] = Operand(v1);
9845    }
9846    /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
9847     * Setting valid_mask=1 prevents it and has no other effect.
9848     */
9849    exp->valid_mask = ctx->options->chip_class >= GFX10 && is_pos && *next_pos == 0;
9850    exp->done = false;
9851    exp->compressed = false;
9852    if (is_pos)
9853       exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
9854    else
9855       exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
9856    ctx->block->instructions.emplace_back(std::move(exp));
9857
9858    return true;
9859 }
9860
9861 static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
9862 {
9863    aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
9864    exp->enabled_mask = 0;
9865    for (unsigned i = 0; i < 4; ++i)
9866       exp->operands[i] = Operand(v1);
9867    if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) {
9868       exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]);
9869       exp->enabled_mask |= 0x1;
9870    }
9871    if (ctx->outputs.mask[VARYING_SLOT_LAYER]) {
9872       exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]);
9873       exp->enabled_mask |= 0x4;
9874    }
9875    if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) {
9876       if (ctx->options->chip_class < GFX9) {
9877          exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]);
9878          exp->enabled_mask |= 0x8;
9879       } else {
9880          Builder bld(ctx->program, ctx->block);
9881
9882          Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
9883                              Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]));
9884          if (exp->operands[2].isTemp())
9885             out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
9886
9887          exp->operands[2] = Operand(out);
9888          exp->enabled_mask |= 0x4;
9889       }
9890    }
9891    exp->valid_mask = ctx->options->chip_class >= GFX10 && *next_pos == 0;
9892    exp->done = false;
9893    exp->compressed = false;
9894    exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
9895    ctx->block->instructions.emplace_back(std::move(exp));
9896 }
9897
9898 static void create_export_phis(isel_context *ctx)
9899 {
9900    /* Used when exports are needed, but the output temps are defined in a preceding block.
9901     * This function will set up phis in order to access the outputs in the next block.
9902     */
9903
9904    assert(ctx->block->instructions.back()->opcode == aco_opcode::p_logical_start);
9905    aco_ptr<Instruction> logical_start = aco_ptr<Instruction>(ctx->block->instructions.back().release());
9906    ctx->block->instructions.pop_back();
9907
9908    Builder bld(ctx->program, ctx->block);
9909
9910    for (unsigned slot = 0; slot <= VARYING_SLOT_VAR31; ++slot) {
9911       uint64_t mask = ctx->outputs.mask[slot];
9912       for (unsigned i = 0; i < 4; ++i) {
9913          if (!(mask & (1 << i)))
9914             continue;
9915
9916          Temp old = ctx->outputs.temps[slot * 4 + i];
9917          Temp phi = bld.pseudo(aco_opcode::p_phi, bld.def(v1), old, Operand(v1));
9918          ctx->outputs.temps[slot * 4 + i] = phi;
9919       }
9920    }
9921
9922    bld.insert(std::move(logical_start));
9923 }
9924
9925 static void create_vs_exports(isel_context *ctx)
9926 {
9927    assert(ctx->stage == vertex_vs ||
9928           ctx->stage == tess_eval_vs ||
9929           ctx->stage == gs_copy_vs ||
9930           ctx->stage == ngg_vertex_gs ||
9931           ctx->stage == ngg_tess_eval_gs);
9932
9933    radv_vs_output_info *outinfo = (ctx->stage & sw_tes)
9934                                   ? &ctx->program->info->tes.outinfo
9935                                   : &ctx->program->info->vs.outinfo;
9936
9937    if (outinfo->export_prim_id && !(ctx->stage & hw_ngg_gs)) {
9938       ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
9939       ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = get_arg(ctx, ctx->args->vs_prim_id);
9940    }
9941
9942    if (ctx->options->key.has_multiview_view_index) {
9943       ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1;
9944       ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] = as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index));
9945    }
9946
9947    /* the order these position exports are created is important */
9948    int next_pos = 0;
9949    bool exported_pos = export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
9950    if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
9951       export_vs_psiz_layer_viewport(ctx, &next_pos);
9952       exported_pos = true;
9953    }
9954    if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
9955       exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
9956    if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
9957       exported_pos |= export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
9958
9959    if (ctx->export_clip_dists) {
9960       if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
9961          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
9962       if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
9963          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
9964    }
9965
9966    for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
9967       if (i < VARYING_SLOT_VAR0 &&
9968           i != VARYING_SLOT_LAYER &&
9969           i != VARYING_SLOT_PRIMITIVE_ID &&
9970           i != VARYING_SLOT_VIEWPORT)
9971          continue;
9972
9973       export_vs_varying(ctx, i, false, NULL);
9974    }
9975
9976    if (!exported_pos)
9977       create_null_export(ctx);
9978 }
9979
9980 static bool export_fs_mrt_z(isel_context *ctx)
9981 {
9982    Builder bld(ctx->program, ctx->block);
9983    unsigned enabled_channels = 0;
9984    bool compr = false;
9985    Operand values[4];
9986
9987    for (unsigned i = 0; i < 4; ++i) {
9988       values[i] = Operand(v1);
9989    }
9990
9991    /* Both stencil and sample mask only need 16-bits. */
9992    if (!ctx->program->info->ps.writes_z &&
9993        (ctx->program->info->ps.writes_stencil ||
9994         ctx->program->info->ps.writes_sample_mask)) {
9995       compr = true; /* COMPR flag */
9996
9997       if (ctx->program->info->ps.writes_stencil) {
9998          /* Stencil should be in X[23:16]. */
9999          values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10000          values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u), values[0]);
10001          enabled_channels |= 0x3;
10002       }
10003
10004       if (ctx->program->info->ps.writes_sample_mask) {
10005          /* SampleMask should be in Y[15:0]. */
10006          values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10007          enabled_channels |= 0xc;
10008      }
10009    } else {
10010       if (ctx->program->info->ps.writes_z) {
10011          values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]);
10012          enabled_channels |= 0x1;
10013       }
10014
10015       if (ctx->program->info->ps.writes_stencil) {
10016          values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]);
10017          enabled_channels |= 0x2;
10018       }
10019
10020       if (ctx->program->info->ps.writes_sample_mask) {
10021          values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]);
10022          enabled_channels |= 0x4;
10023       }
10024    }
10025
10026    /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X
10027     * writemask component.
10028     */
10029    if (ctx->options->chip_class == GFX6 &&
10030        ctx->options->family != CHIP_OLAND &&
10031        ctx->options->family != CHIP_HAINAN) {
10032             enabled_channels |= 0x1;
10033    }
10034
10035    bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
10036            enabled_channels, V_008DFC_SQ_EXP_MRTZ, compr);
10037
10038    return true;
10039 }
10040
10041 static bool export_fs_mrt_color(isel_context *ctx, int slot)
10042 {
10043    Builder bld(ctx->program, ctx->block);
10044    unsigned write_mask = ctx->outputs.mask[slot];
10045    Operand values[4];
10046
10047    for (unsigned i = 0; i < 4; ++i) {
10048       if (write_mask & (1 << i)) {
10049          values[i] = Operand(ctx->outputs.temps[slot * 4u + i]);
10050       } else {
10051          values[i] = Operand(v1);
10052       }
10053    }
10054
10055    unsigned target, col_format;
10056    unsigned enabled_channels = 0;
10057    aco_opcode compr_op = (aco_opcode)0;
10058
10059    slot -= FRAG_RESULT_DATA0;
10060    target = V_008DFC_SQ_EXP_MRT + slot;
10061    col_format = (ctx->options->key.fs.col_format >> (4 * slot)) & 0xf;
10062
10063    bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1;
10064    bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1;
10065    bool is_16bit = values[0].regClass() == v2b;
10066
10067    switch (col_format)
10068    {
10069    case V_028714_SPI_SHADER_ZERO:
10070       enabled_channels = 0; /* writemask */
10071       target = V_008DFC_SQ_EXP_NULL;
10072       break;
10073
10074    case V_028714_SPI_SHADER_32_R:
10075       enabled_channels = 1;
10076       break;
10077
10078    case V_028714_SPI_SHADER_32_GR:
10079       enabled_channels = 0x3;
10080       break;
10081
10082    case V_028714_SPI_SHADER_32_AR:
10083       if (ctx->options->chip_class >= GFX10) {
10084          /* Special case: on GFX10, the outputs are different for 32_AR */
10085          enabled_channels = 0x3;
10086          values[1] = values[3];
10087          values[3] = Operand(v1);
10088       } else {
10089          enabled_channels = 0x9;
10090       }
10091       break;
10092
10093    case V_028714_SPI_SHADER_FP16_ABGR:
10094       enabled_channels = 0x5;
10095       compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
10096       if (is_16bit) {
10097          if (ctx->options->chip_class >= GFX9) {
10098             /* Pack the FP16 values together instead of converting them to
10099              * FP32 and back to FP16.
10100              * TODO: use p_create_vector and let the compiler optimizes.
10101              */
10102             compr_op = aco_opcode::v_pack_b32_f16;
10103          } else {
10104             for (unsigned i = 0; i < 4; i++) {
10105                if ((write_mask >> i) & 1)
10106                   values[i] = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), values[i]);
10107             }
10108          }
10109       }
10110       break;
10111
10112    case V_028714_SPI_SHADER_UNORM16_ABGR:
10113       enabled_channels = 0x5;
10114       if (is_16bit && ctx->options->chip_class >= GFX9) {
10115          compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
10116       } else {
10117          compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
10118       }
10119       break;
10120
10121    case V_028714_SPI_SHADER_SNORM16_ABGR:
10122       enabled_channels = 0x5;
10123       if (is_16bit && ctx->options->chip_class >= GFX9) {
10124          compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
10125       } else {
10126          compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
10127       }
10128       break;
10129
10130    case V_028714_SPI_SHADER_UINT16_ABGR: {
10131       enabled_channels = 0x5;
10132       compr_op = aco_opcode::v_cvt_pk_u16_u32;
10133       if (is_int8 || is_int10) {
10134          /* clamp */
10135          uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0;
10136          Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
10137
10138          for (unsigned i = 0; i < 4; i++) {
10139             if ((write_mask >> i) & 1) {
10140                values[i] = bld.vop2(aco_opcode::v_min_u32, bld.def(v1),
10141                                     i == 3 && is_int10 ? Operand(3u) : Operand(max_rgb_val),
10142                                     values[i]);
10143             }
10144          }
10145       } else if (is_16bit) {
10146          for (unsigned i = 0; i < 4; i++) {
10147             if ((write_mask >> i) & 1) {
10148                Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
10149                values[i] = Operand(tmp);
10150             }
10151          }
10152       }
10153       break;
10154    }
10155
10156    case V_028714_SPI_SHADER_SINT16_ABGR:
10157       enabled_channels = 0x5;
10158       compr_op = aco_opcode::v_cvt_pk_i16_i32;
10159       if (is_int8 || is_int10) {
10160          /* clamp */
10161          uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0;
10162          uint32_t min_rgb = is_int8 ? -128 :is_int10 ? -512 : 0;
10163          Temp max_rgb_val = bld.copy(bld.def(s1), Operand(max_rgb));
10164          Temp min_rgb_val = bld.copy(bld.def(s1), Operand(min_rgb));
10165
10166          for (unsigned i = 0; i < 4; i++) {
10167             if ((write_mask >> i) & 1) {
10168                values[i] = bld.vop2(aco_opcode::v_min_i32, bld.def(v1),
10169                                     i == 3 && is_int10 ? Operand(1u) : Operand(max_rgb_val),
10170                                     values[i]);
10171                values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1),
10172                                     i == 3 && is_int10 ? Operand(-2u) : Operand(min_rgb_val),
10173                                     values[i]);
10174             }
10175          }
10176       } else if (is_16bit) {
10177          for (unsigned i = 0; i < 4; i++) {
10178             if ((write_mask >> i) & 1) {
10179                Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
10180                values[i] = Operand(tmp);
10181             }
10182          }
10183       }
10184       break;
10185
10186    case V_028714_SPI_SHADER_32_ABGR:
10187       enabled_channels = 0xF;
10188       break;
10189
10190    default:
10191       break;
10192    }
10193
10194    if (target == V_008DFC_SQ_EXP_NULL)
10195       return false;
10196
10197    /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
10198    if (ctx->options->enable_mrt_output_nan_fixup &&
10199        !is_16bit &&
10200        (col_format == V_028714_SPI_SHADER_32_R ||
10201         col_format == V_028714_SPI_SHADER_32_GR ||
10202         col_format == V_028714_SPI_SHADER_32_AR ||
10203         col_format == V_028714_SPI_SHADER_32_ABGR ||
10204         col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
10205       for (int i = 0; i < 4; i++) {
10206          if (!(write_mask & (1 << i)))
10207             continue;
10208
10209          Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32,
10210                                bld.hint_vcc(bld.def(bld.lm)), values[i],
10211                                bld.copy(bld.def(v1), Operand(3u)));
10212          values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
10213                               bld.copy(bld.def(v1), Operand(0u)), isnan);
10214       }
10215    }
10216
10217    if ((bool) compr_op) {
10218       for (int i = 0; i < 2; i++) {
10219          /* check if at least one of the values to be compressed is enabled */
10220          unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
10221          if (enabled) {
10222             enabled_channels |= enabled << (i*2);
10223             values[i] = bld.vop3(compr_op, bld.def(v1),
10224                                  values[i*2].isUndefined() ? Operand(0u) : values[i*2],
10225                                  values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1]);
10226          } else {
10227             values[i] = Operand(v1);
10228          }
10229       }
10230       values[2] = Operand(v1);
10231       values[3] = Operand(v1);
10232    } else {
10233       for (int i = 0; i < 4; i++)
10234          values[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
10235    }
10236
10237    bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3],
10238            enabled_channels, target, (bool) compr_op);
10239    return true;
10240 }
10241
10242 static void create_fs_exports(isel_context *ctx)
10243 {
10244    bool exported = false;
10245
10246    /* Export depth, stencil and sample mask. */
10247    if (ctx->outputs.mask[FRAG_RESULT_DEPTH] ||
10248        ctx->outputs.mask[FRAG_RESULT_STENCIL] ||
10249        ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK])
10250       exported |= export_fs_mrt_z(ctx);
10251
10252    /* Export all color render targets. */
10253    for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i)
10254       if (ctx->outputs.mask[i])
10255          exported |= export_fs_mrt_color(ctx, i);
10256
10257    if (!exported)
10258       create_null_export(ctx);
10259 }
10260
10261 static void write_tcs_tess_factors(isel_context *ctx)
10262 {
10263    unsigned outer_comps;
10264    unsigned inner_comps;
10265
10266    switch (ctx->args->options->key.tcs.primitive_mode) {
10267    case GL_ISOLINES:
10268       outer_comps = 2;
10269       inner_comps = 0;
10270       break;
10271    case GL_TRIANGLES:
10272       outer_comps = 3;
10273       inner_comps = 1;
10274       break;
10275    case GL_QUADS:
10276       outer_comps = 4;
10277       inner_comps = 2;
10278       break;
10279    default:
10280       return;
10281    }
10282
10283    Builder bld(ctx->program, ctx->block);
10284
10285    bld.barrier(aco_opcode::p_memory_barrier_shared);
10286    if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
10287       bld.sopp(aco_opcode::s_barrier);
10288
10289    Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
10290    Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u));
10291
10292    Temp invocation_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), invocation_id);
10293    if_context ic_invocation_id_is_zero;
10294    begin_divergent_if_then(ctx, &ic_invocation_id_is_zero, invocation_id_is_zero);
10295    bld.reset(ctx->block);
10296
10297    Temp hs_ring_tess_factor = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_FACTOR * 16u));
10298
10299    std::pair<Temp, unsigned> lds_base = get_tcs_output_lds_offset(ctx);
10300    unsigned stride = inner_comps + outer_comps;
10301    unsigned lds_align = calculate_lds_alignment(ctx, lds_base.second);
10302    Temp tf_inner_vec;
10303    Temp tf_outer_vec;
10304    Temp out[6];
10305    assert(stride <= (sizeof(out) / sizeof(Temp)));
10306
10307    if (ctx->args->options->key.tcs.primitive_mode == GL_ISOLINES) {
10308       // LINES reversal
10309       tf_outer_vec = load_lds(ctx, 4, bld.tmp(v2), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_out_loc, lds_align);
10310       out[1] = emit_extract_vector(ctx, tf_outer_vec, 0, v1);
10311       out[0] = emit_extract_vector(ctx, tf_outer_vec, 1, v1);
10312    } else {
10313       tf_outer_vec = load_lds(ctx, 4, bld.tmp(RegClass(RegType::vgpr, outer_comps)), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_out_loc, lds_align);
10314       tf_inner_vec = load_lds(ctx, 4, bld.tmp(RegClass(RegType::vgpr, inner_comps)), lds_base.first, lds_base.second + ctx->tcs_tess_lvl_in_loc, lds_align);
10315
10316       for (unsigned i = 0; i < outer_comps; ++i)
10317          out[i] = emit_extract_vector(ctx, tf_outer_vec, i, v1);
10318       for (unsigned i = 0; i < inner_comps; ++i)
10319          out[outer_comps + i] = emit_extract_vector(ctx, tf_inner_vec, i, v1);
10320    }
10321
10322    Temp rel_patch_id = get_tess_rel_patch_id(ctx);
10323    Temp tf_base = get_arg(ctx, ctx->args->tess_factor_offset);
10324    Temp byte_offset = bld.v_mul24_imm(bld.def(v1), rel_patch_id, stride * 4u);
10325    unsigned tf_const_offset = 0;
10326
10327    if (ctx->program->chip_class <= GFX8) {
10328       Temp rel_patch_id_is_zero = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), rel_patch_id);
10329       if_context ic_rel_patch_id_is_zero;
10330       begin_divergent_if_then(ctx, &ic_rel_patch_id_is_zero, rel_patch_id_is_zero);
10331       bld.reset(ctx->block);
10332
10333       /* Store the dynamic HS control word. */
10334       Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
10335       bld.mubuf(aco_opcode::buffer_store_dword,
10336                 /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
10337                 /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false,
10338                 /* disable_wqm */ false, /* glc */ true);
10339       tf_const_offset += 4;
10340
10341       begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);
10342       end_divergent_if(ctx, &ic_rel_patch_id_is_zero);
10343       bld.reset(ctx->block);
10344    }
10345
10346    assert(stride == 2 || stride == 4 || stride == 6);
10347    Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr, 4u);
10348    store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false);
10349
10350    /* Store to offchip for TES to read - only if TES reads them */
10351    if (ctx->args->options->key.tcs.tes_reads_tess_factors) {
10352       Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
10353       Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
10354
10355       std::pair<Temp, unsigned> vmem_offs_outer = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_out_loc);
10356       store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, false);
10357
10358       if (likely(inner_comps)) {
10359          std::pair<Temp, unsigned> vmem_offs_inner = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_in_loc);
10360          store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, false);
10361       }
10362    }
10363
10364    begin_divergent_if_else(ctx, &ic_invocation_id_is_zero);
10365    end_divergent_if(ctx, &ic_invocation_id_is_zero);
10366 }
10367
10368 static void emit_stream_output(isel_context *ctx,
10369                                Temp const *so_buffers,
10370                                Temp const *so_write_offset,
10371                                const struct radv_stream_output *output)
10372 {
10373    unsigned num_comps = util_bitcount(output->component_mask);
10374    unsigned writemask = (1 << num_comps) - 1;
10375    unsigned loc = output->location;
10376    unsigned buf = output->buffer;
10377
10378    assert(num_comps && num_comps <= 4);
10379    if (!num_comps || num_comps > 4)
10380       return;
10381
10382    unsigned start = ffs(output->component_mask) - 1;
10383
10384    Temp out[4];
10385    bool all_undef = true;
10386    assert(ctx->stage & hw_vs);
10387    for (unsigned i = 0; i < num_comps; i++) {
10388       out[i] = ctx->outputs.temps[loc * 4 + start + i];
10389       all_undef = all_undef && !out[i].id();
10390    }
10391    if (all_undef)
10392       return;
10393
10394    while (writemask) {
10395       int start, count;
10396       u_bit_scan_consecutive_range(&writemask, &start, &count);
10397       if (count == 3 && ctx->options->chip_class == GFX6) {
10398          /* GFX6 doesn't support storing vec3, split it. */
10399          writemask |= 1u << (start + 2);
10400          count = 2;
10401       }
10402
10403       unsigned offset = output->offset + start * 4;
10404
10405       Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, count)};
10406       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
10407       for (int i = 0; i < count; ++i)
10408          vec->operands[i] = (ctx->outputs.mask[loc] & 1 << (start + i)) ? Operand(out[start + i]) : Operand(0u);
10409       vec->definitions[0] = Definition(write_data);
10410       ctx->block->instructions.emplace_back(std::move(vec));
10411
10412       aco_opcode opcode;
10413       switch (count) {
10414       case 1:
10415          opcode = aco_opcode::buffer_store_dword;
10416          break;
10417       case 2:
10418          opcode = aco_opcode::buffer_store_dwordx2;
10419          break;
10420       case 3:
10421          opcode = aco_opcode::buffer_store_dwordx3;
10422          break;
10423       case 4:
10424          opcode = aco_opcode::buffer_store_dwordx4;
10425          break;
10426       default:
10427          unreachable("Unsupported dword count.");
10428       }
10429
10430       aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
10431       store->operands[0] = Operand(so_buffers[buf]);
10432       store->operands[1] = Operand(so_write_offset[buf]);
10433       store->operands[2] = Operand((uint32_t) 0);
10434       store->operands[3] = Operand(write_data);
10435       if (offset > 4095) {
10436          /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */
10437          Builder bld(ctx->program, ctx->block);
10438          store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
10439       } else {
10440          store->offset = offset;
10441       }
10442       store->offen = true;
10443       store->glc = true;
10444       store->dlc = false;
10445       store->slc = true;
10446       store->can_reorder = true;
10447       ctx->block->instructions.emplace_back(std::move(store));
10448    }
10449 }
10450
10451 static void emit_streamout(isel_context *ctx, unsigned stream)
10452 {
10453    Builder bld(ctx->program, ctx->block);
10454
10455    Temp so_buffers[4];
10456    Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers));
10457    for (unsigned i = 0; i < 4; i++) {
10458       unsigned stride = ctx->program->info->so.strides[i];
10459       if (!stride)
10460          continue;
10461
10462       Operand off = bld.copy(bld.def(s1), Operand(i * 16u));
10463       so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, off);
10464    }
10465
10466    Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10467                                 get_arg(ctx, ctx->args->streamout_config), Operand(0x70010u));
10468
10469    Temp tid = emit_mbcnt(ctx, bld.def(v1));
10470
10471    Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid);
10472
10473    if_context ic;
10474    begin_divergent_if_then(ctx, &ic, can_emit);
10475
10476    bld.reset(ctx->block);
10477
10478    Temp so_write_index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->streamout_write_idx), tid);
10479
10480    Temp so_write_offset[4];
10481
10482    for (unsigned i = 0; i < 4; i++) {
10483       unsigned stride = ctx->program->info->so.strides[i];
10484       if (!stride)
10485          continue;
10486
10487       if (stride == 1) {
10488          Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
10489                                 get_arg(ctx, ctx->args->streamout_write_idx),
10490                                 get_arg(ctx, ctx->args->streamout_offset[i]));
10491          Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
10492
10493          so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
10494       } else {
10495          Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
10496          Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u),
10497                                  get_arg(ctx, ctx->args->streamout_offset[i]));
10498          so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
10499       }
10500    }
10501
10502    for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
10503       struct radv_stream_output *output =
10504          &ctx->program->info->so.outputs[i];
10505       if (stream != output->stream)
10506          continue;
10507
10508       emit_stream_output(ctx, so_buffers, so_write_offset, output);
10509    }
10510
10511    begin_divergent_if_else(ctx, &ic);
10512    end_divergent_if(ctx, &ic);
10513 }
10514
10515 } /* end namespace */
10516
10517 void fix_ls_vgpr_init_bug(isel_context *ctx, Pseudo_instruction *startpgm)
10518 {
10519    assert(ctx->shader->info.stage == MESA_SHADER_VERTEX);
10520    Builder bld(ctx->program, ctx->block);
10521    constexpr unsigned hs_idx = 1u;
10522    Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10523                                               get_arg(ctx, ctx->args->merged_wave_info),
10524                                               Operand((8u << 16) | (hs_idx * 8u)));
10525    Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp());
10526
10527    /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. */
10528
10529    Temp instance_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10530                                get_arg(ctx, ctx->args->rel_auto_id),
10531                                get_arg(ctx, ctx->args->ac.instance_id),
10532                                ls_has_nonzero_hs_threads);
10533    Temp rel_auto_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10534                                get_arg(ctx, ctx->args->ac.tcs_rel_ids),
10535                                get_arg(ctx, ctx->args->rel_auto_id),
10536                                ls_has_nonzero_hs_threads);
10537    Temp vertex_id = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10538                              get_arg(ctx, ctx->args->ac.tcs_patch_id),
10539                              get_arg(ctx, ctx->args->ac.vertex_id),
10540                              ls_has_nonzero_hs_threads);
10541
10542    ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id;
10543    ctx->arg_temps[ctx->args->rel_auto_id.arg_index] = rel_auto_id;
10544    ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id;
10545 }
10546
10547 void split_arguments(isel_context *ctx, Pseudo_instruction *startpgm)
10548 {
10549    /* Split all arguments except for the first (ring_offsets) and the last
10550     * (exec) so that the dead channels don't stay live throughout the program.
10551     */
10552    for (int i = 1; i < startpgm->definitions.size() - 1; i++) {
10553       if (startpgm->definitions[i].regClass().size() > 1) {
10554          emit_split_vector(ctx, startpgm->definitions[i].getTemp(),
10555                            startpgm->definitions[i].regClass().size());
10556       }
10557    }
10558 }
10559
10560 void handle_bc_optimize(isel_context *ctx)
10561 {
10562    /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
10563    Builder bld(ctx->program, ctx->block);
10564    uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
10565    bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
10566    bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
10567    ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid);
10568    ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid);
10569    if (uses_center && uses_centroid) {
10570       Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)),
10571                               get_arg(ctx, ctx->args->ac.prim_mask), Operand(0u));
10572
10573       if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
10574          Temp new_coord[2];
10575          for (unsigned i = 0; i < 2; i++) {
10576             Temp persp_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1);
10577             Temp persp_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1);
10578             new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10579                                     persp_centroid, persp_center, sel);
10580          }
10581          ctx->persp_centroid = bld.tmp(v2);
10582          bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid),
10583                     Operand(new_coord[0]), Operand(new_coord[1]));
10584          emit_split_vector(ctx, ctx->persp_centroid, 2);
10585       }
10586
10587       if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
10588          Temp new_coord[2];
10589          for (unsigned i = 0; i < 2; i++) {
10590             Temp linear_centroid = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1);
10591             Temp linear_center = emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1);
10592             new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
10593                                     linear_centroid, linear_center, sel);
10594          }
10595          ctx->linear_centroid = bld.tmp(v2);
10596          bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid),
10597                     Operand(new_coord[0]), Operand(new_coord[1]));
10598          emit_split_vector(ctx, ctx->linear_centroid, 2);
10599       }
10600    }
10601 }
10602
10603 void setup_fp_mode(isel_context *ctx, nir_shader *shader)
10604 {
10605    Program *program = ctx->program;
10606
10607    unsigned float_controls = shader->info.float_controls_execution_mode;
10608
10609    program->next_fp_mode.preserve_signed_zero_inf_nan32 =
10610       float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32;
10611    program->next_fp_mode.preserve_signed_zero_inf_nan16_64 =
10612       float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 |
10613                         FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64);
10614
10615    program->next_fp_mode.must_flush_denorms32 =
10616       float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32;
10617    program->next_fp_mode.must_flush_denorms16_64 =
10618       float_controls & (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 |
10619                         FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64);
10620
10621    program->next_fp_mode.care_about_round32 =
10622       float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32);
10623
10624    program->next_fp_mode.care_about_round16_64 =
10625       float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
10626                         FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
10627
10628    /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
10629     * the precision seems needed for Wolfenstein: Youngblood to render correctly */
10630    if (program->next_fp_mode.must_flush_denorms16_64)
10631       program->next_fp_mode.denorm16_64 = 0;
10632    else
10633       program->next_fp_mode.denorm16_64 = fp_denorm_keep;
10634
10635    /* preserving fp32 denorms is expensive, so only do it if asked */
10636    if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32)
10637       program->next_fp_mode.denorm32 = fp_denorm_keep;
10638    else
10639       program->next_fp_mode.denorm32 = 0;
10640
10641    if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32)
10642       program->next_fp_mode.round32 = fp_round_tz;
10643    else
10644       program->next_fp_mode.round32 = fp_round_ne;
10645
10646    if (float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64))
10647       program->next_fp_mode.round16_64 = fp_round_tz;
10648    else
10649       program->next_fp_mode.round16_64 = fp_round_ne;
10650
10651    ctx->block->fp_mode = program->next_fp_mode;
10652 }
10653
10654 void cleanup_cfg(Program *program)
10655 {
10656    /* create linear_succs/logical_succs */
10657    for (Block& BB : program->blocks) {
10658       for (unsigned idx : BB.linear_preds)
10659          program->blocks[idx].linear_succs.emplace_back(BB.index);
10660       for (unsigned idx : BB.logical_preds)
10661          program->blocks[idx].logical_succs.emplace_back(BB.index);
10662    }
10663 }
10664
10665 Temp merged_wave_info_to_mask(isel_context *ctx, unsigned i)
10666 {
10667    Builder bld(ctx->program, ctx->block);
10668
10669    /* The s_bfm only cares about s0.u[5:0] so we don't need either s_bfe nor s_and here */
10670    Temp count = i == 0
10671                 ? get_arg(ctx, ctx->args->merged_wave_info)
10672                 : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
10673                            get_arg(ctx, ctx->args->merged_wave_info), Operand(i * 8u));
10674
10675    Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand(0u));
10676    Temp cond;
10677
10678    if (ctx->program->wave_size == 64) {
10679       /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */
10680       Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), count, Operand(6u /* log2(64) */));
10681       cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand(-1u), mask, bld.scc(active_64));
10682    } else {
10683       /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of the register */
10684       cond = emit_extract_vector(ctx, mask, 0, bld.lm);
10685    }
10686
10687    return cond;
10688 }
10689
10690 bool ngg_early_prim_export(isel_context *ctx)
10691 {
10692    /* TODO: Check edge flags, and if they are written, return false. (Needed for OpenGL, not for Vulkan.) */
10693    return true;
10694 }
10695
10696 void ngg_emit_sendmsg_gs_alloc_req(isel_context *ctx)
10697 {
10698    Builder bld(ctx->program, ctx->block);
10699
10700    /* It is recommended to do the GS_ALLOC_REQ as soon and as quickly as possible, so we set the maximum priority (3). */
10701    bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);
10702
10703    /* Get the id of the current wave within the threadgroup (workgroup) */
10704    Builder::Result wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10705                                             get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16)));
10706
10707    /* Execute the following code only on the first wave (wave id 0),
10708     * use the SCC def to tell if the wave id is zero or not.
10709     */
10710    Temp cond = wave_id_in_tg.def(1).getTemp();
10711    if_context ic;
10712    begin_uniform_if_then(ctx, &ic, cond);
10713    begin_uniform_if_else(ctx, &ic);
10714    bld.reset(ctx->block);
10715
10716    /* Number of vertices output by VS/TES */
10717    Temp vtx_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10718                            get_arg(ctx, ctx->args->gs_tg_info), Operand(12u | (9u << 16u)));
10719    /* Number of primitives output by VS/TES */
10720    Temp prm_cnt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10721                            get_arg(ctx, ctx->args->gs_tg_info), Operand(22u | (9u << 16u)));
10722
10723    /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
10724    Temp tmp = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand(12u));
10725    tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);
10726
10727    /* Request the SPI to allocate space for the primitives and vertices that will be exported by the threadgroup. */
10728    bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);
10729
10730    end_uniform_if(ctx, &ic);
10731
10732    /* After the GS_ALLOC_REQ is done, reset priority to default (0). */
10733    bld.reset(ctx->block);
10734    bld.sopp(aco_opcode::s_setprio, -1u, 0x0u);
10735 }
10736
10737 Temp ngg_get_prim_exp_arg(isel_context *ctx, unsigned num_vertices, const Temp vtxindex[])
10738 {
10739    Builder bld(ctx->program, ctx->block);
10740
10741    if (ctx->args->options->key.vs_common_out.as_ngg_passthrough) {
10742       return get_arg(ctx, ctx->args->gs_vtx_offset[0]);
10743    }
10744
10745    Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id);
10746    Temp tmp;
10747
10748    for (unsigned i = 0; i < num_vertices; ++i) {
10749       assert(vtxindex[i].id());
10750
10751       if (i)
10752          tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), vtxindex[i], Operand(10u * i), tmp);
10753       else
10754          tmp = vtxindex[i];
10755
10756       /* The initial edge flag is always false in tess eval shaders. */
10757       if (ctx->stage == ngg_vertex_gs) {
10758          Temp edgeflag = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), gs_invocation_id, Operand(8 + i), Operand(1u));
10759          tmp = bld.vop3(aco_opcode::v_lshl_add_u32, bld.def(v1), edgeflag, Operand(10u * i + 9u), tmp);
10760       }
10761    }
10762
10763    /* TODO: Set isnull field in case of merged NGG VS+GS. */
10764
10765    return tmp;
10766 }
10767
10768 void ngg_emit_prim_export(isel_context *ctx, unsigned num_vertices_per_primitive, const Temp vtxindex[])
10769 {
10770    Builder bld(ctx->program, ctx->block);
10771    Temp prim_exp_arg = ngg_get_prim_exp_arg(ctx, num_vertices_per_primitive, vtxindex);
10772
10773    bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1),
10774         1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */,
10775         false /* compressed */, true/* done */, false /* valid mask */);
10776 }
10777
10778 void ngg_emit_nogs_gsthreads(isel_context *ctx)
10779 {
10780    /* Emit the things that NGG GS threads need to do, for shaders that don't have SW GS.
10781     * These must always come before VS exports.
10782     *
10783     * It is recommended to do these as early as possible. They can be at the beginning when
10784     * there is no SW GS and the shader doesn't write edge flags.
10785     */
10786
10787    if_context ic;
10788    Temp is_gs_thread = merged_wave_info_to_mask(ctx, 1);
10789    begin_divergent_if_then(ctx, &ic, is_gs_thread);
10790
10791    Builder bld(ctx->program, ctx->block);
10792    constexpr unsigned max_vertices_per_primitive = 3;
10793    unsigned num_vertices_per_primitive = max_vertices_per_primitive;
10794
10795    if (ctx->stage == ngg_vertex_gs) {
10796       /* TODO: optimize for points & lines */
10797    } else if (ctx->stage == ngg_tess_eval_gs) {
10798       if (ctx->shader->info.tess.point_mode)
10799          num_vertices_per_primitive = 1;
10800       else if (ctx->shader->info.tess.primitive_mode == GL_ISOLINES)
10801          num_vertices_per_primitive = 2;
10802    } else {
10803       unreachable("Unsupported NGG shader stage");
10804    }
10805
10806    Temp vtxindex[max_vertices_per_primitive];
10807    vtxindex[0] = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
10808                           get_arg(ctx, ctx->args->gs_vtx_offset[0]));
10809    vtxindex[1] = num_vertices_per_primitive < 2 ? Temp(0, v1) :
10810                  bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1),
10811                           get_arg(ctx, ctx->args->gs_vtx_offset[0]), Operand(16u), Operand(16u));
10812    vtxindex[2] = num_vertices_per_primitive < 3 ? Temp(0, v1) :
10813                  bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0xffffu),
10814                           get_arg(ctx, ctx->args->gs_vtx_offset[2]));
10815
10816    /* Export primitive data to the index buffer. */
10817    ngg_emit_prim_export(ctx, num_vertices_per_primitive, vtxindex);
10818
10819    /* Export primitive ID. */
10820    if (ctx->stage == ngg_vertex_gs && ctx->args->options->key.vs_common_out.export_prim_id) {
10821       /* Copy Primitive IDs from GS threads to the LDS address corresponding to the ES thread of the provoking vertex. */
10822       Temp prim_id = get_arg(ctx, ctx->args->ac.gs_prim_id);
10823       Temp provoking_vtx_index = vtxindex[0];
10824       Temp addr = bld.v_mul_imm(bld.def(v1), provoking_vtx_index, 4u);
10825
10826       store_lds(ctx, 4, prim_id, 0x1u, addr, 0u, 4u);
10827    }
10828
10829    begin_divergent_if_else(ctx, &ic);
10830    end_divergent_if(ctx, &ic);
10831 }
10832
10833 void ngg_emit_nogs_output(isel_context *ctx)
10834 {
10835    /* Emits NGG GS output, for stages that don't have SW GS. */
10836
10837    if_context ic;
10838    Builder bld(ctx->program, ctx->block);
10839    bool late_prim_export = !ngg_early_prim_export(ctx);
10840
10841    /* NGG streamout is currently disabled by default. */
10842    assert(!ctx->args->shader_info->so.num_outputs);
10843
10844    if (late_prim_export) {
10845       /* VS exports are output to registers in a predecessor block. Emit phis to get them into this block. */
10846       create_export_phis(ctx);
10847       /* Do what we need to do in the GS threads. */
10848       ngg_emit_nogs_gsthreads(ctx);
10849
10850       /* What comes next should be executed on ES threads. */
10851       Temp is_es_thread = merged_wave_info_to_mask(ctx, 0);
10852       begin_divergent_if_then(ctx, &ic, is_es_thread);
10853       bld.reset(ctx->block);
10854    }
10855
10856    /* Export VS outputs */
10857    ctx->block->kind |= block_kind_export_end;
10858    create_vs_exports(ctx);
10859
10860    /* Export primitive ID */
10861    if (ctx->args->options->key.vs_common_out.export_prim_id) {
10862       Temp prim_id;
10863
10864       if (ctx->stage == ngg_vertex_gs) {
10865          /* Wait for GS threads to store primitive ID in LDS. */
10866          bld.barrier(aco_opcode::p_memory_barrier_shared);
10867          bld.sopp(aco_opcode::s_barrier);
10868
10869          /* Calculate LDS address where the GS threads stored the primitive ID. */
10870          Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
10871                                        get_arg(ctx, ctx->args->merged_wave_info), Operand(24u | (4u << 16)));
10872          Temp thread_id_in_wave = emit_mbcnt(ctx, bld.def(v1));
10873          Temp wave_id_mul = bld.v_mul24_imm(bld.def(v1), as_vgpr(ctx, wave_id_in_tg), ctx->program->wave_size);
10874          Temp thread_id_in_tg = bld.vadd32(bld.def(v1), Operand(wave_id_mul), Operand(thread_id_in_wave));
10875          Temp addr = bld.v_mul24_imm(bld.def(v1), thread_id_in_tg, 4u);
10876
10877          /* Load primitive ID from LDS. */
10878          prim_id = load_lds(ctx, 4, bld.tmp(v1), addr, 0u, 4u);
10879       } else if (ctx->stage == ngg_tess_eval_gs) {
10880          /* TES: Just use the patch ID as the primitive ID. */
10881          prim_id = get_arg(ctx, ctx->args->ac.tes_patch_id);
10882       } else {
10883          unreachable("unsupported NGG shader stage.");
10884       }
10885
10886       ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
10887       ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = prim_id;
10888
10889       export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, nullptr);
10890    }
10891
10892    if (late_prim_export) {
10893       begin_divergent_if_else(ctx, &ic);
10894       end_divergent_if(ctx, &ic);
10895       bld.reset(ctx->block);
10896    }
10897 }
10898
10899 void select_program(Program *program,
10900                     unsigned shader_count,
10901                     struct nir_shader *const *shaders,
10902                     ac_shader_config* config,
10903                     struct radv_shader_args *args)
10904 {
10905    isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
10906    if_context ic_merged_wave_info;
10907    bool ngg_no_gs = ctx.stage == ngg_vertex_gs || ctx.stage == ngg_tess_eval_gs;
10908
10909    for (unsigned i = 0; i < shader_count; i++) {
10910       nir_shader *nir = shaders[i];
10911       init_context(&ctx, nir);
10912
10913       setup_fp_mode(&ctx, nir);
10914
10915       if (!i) {
10916          /* needs to be after init_context() for FS */
10917          Pseudo_instruction *startpgm = add_startpgm(&ctx);
10918          append_logical_start(ctx.block);
10919
10920          if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
10921             fix_ls_vgpr_init_bug(&ctx, startpgm);
10922
10923          split_arguments(&ctx, startpgm);
10924       }
10925
10926       if (ngg_no_gs) {
10927          ngg_emit_sendmsg_gs_alloc_req(&ctx);
10928
10929          if (ngg_early_prim_export(&ctx))
10930             ngg_emit_nogs_gsthreads(&ctx);
10931       }
10932
10933       /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
10934       nir_function_impl *func = nir_shader_get_entrypoint(nir);
10935       bool empty_shader = nir_cf_list_is_empty_block(&func->body) &&
10936                           ((nir->info.stage == MESA_SHADER_VERTEX &&
10937                             (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
10938                            (nir->info.stage == MESA_SHADER_TESS_EVAL &&
10939                             ctx.stage == tess_eval_geometry_gs));
10940
10941       bool check_merged_wave_info = ctx.tcs_in_out_eq ? i == 0 : ((shader_count >= 2 && !empty_shader) || ngg_no_gs);
10942       bool endif_merged_wave_info = ctx.tcs_in_out_eq ? i == 1 : check_merged_wave_info;
10943       if (check_merged_wave_info) {
10944          Temp cond = merged_wave_info_to_mask(&ctx, i);
10945          begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
10946       }
10947
10948       if (i) {
10949          Builder bld(ctx.program, ctx.block);
10950
10951          bld.barrier(aco_opcode::p_memory_barrier_shared);
10952          bld.sopp(aco_opcode::s_barrier);
10953
10954          if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
10955             ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
10956          }
10957       } else if (ctx.stage == geometry_gs)
10958          ctx.gs_wave_id = get_arg(&ctx, args->gs_wave_id);
10959
10960       if (ctx.stage == fragment_fs)
10961          handle_bc_optimize(&ctx);
10962
10963       visit_cf_list(&ctx, &func->body);
10964
10965       if (ctx.program->info->so.num_outputs && (ctx.stage & hw_vs))
10966          emit_streamout(&ctx, 0);
10967
10968       if (ctx.stage & hw_vs) {
10969          create_vs_exports(&ctx);
10970          ctx.block->kind |= block_kind_export_end;
10971       } else if (ngg_no_gs && ngg_early_prim_export(&ctx)) {
10972          ngg_emit_nogs_output(&ctx);
10973       } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
10974          Builder bld(ctx.program, ctx.block);
10975          bld.barrier(aco_opcode::p_memory_barrier_gs_data);
10976          bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
10977       } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
10978          write_tcs_tess_factors(&ctx);
10979       }
10980
10981       if (ctx.stage == fragment_fs) {
10982          create_fs_exports(&ctx);
10983          ctx.block->kind |= block_kind_export_end;
10984       }
10985
10986       if (endif_merged_wave_info) {
10987          begin_divergent_if_else(&ctx, &ic_merged_wave_info);
10988          end_divergent_if(&ctx, &ic_merged_wave_info);
10989       }
10990
10991       if (ngg_no_gs && !ngg_early_prim_export(&ctx))
10992          ngg_emit_nogs_output(&ctx);
10993
10994       if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
10995          /* Outputs of the previous stage are inputs to the next stage */
10996          ctx.inputs = ctx.outputs;
10997          ctx.outputs = shader_io_state();
10998       }
10999    }
11000
11001    program->config->float_mode = program->blocks[0].fp_mode.val;
11002
11003    append_logical_end(ctx.block);
11004    ctx.block->kind |= block_kind_uniform;
11005    Builder bld(ctx.program, ctx.block);
11006    if (ctx.program->wb_smem_l1_on_end)
11007       bld.smem(aco_opcode::s_dcache_wb, false);
11008    bld.sopp(aco_opcode::s_endpgm);
11009
11010    cleanup_cfg(program);
11011 }
11012
11013 void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
11014                            ac_shader_config* config,
11015                            struct radv_shader_args *args)
11016 {
11017    isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
11018
11019    program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
11020    program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
11021    program->next_fp_mode.must_flush_denorms32 = false;
11022    program->next_fp_mode.must_flush_denorms16_64 = false;
11023    program->next_fp_mode.care_about_round32 = false;
11024    program->next_fp_mode.care_about_round16_64 = false;
11025    program->next_fp_mode.denorm16_64 = fp_denorm_keep;
11026    program->next_fp_mode.denorm32 = 0;
11027    program->next_fp_mode.round32 = fp_round_ne;
11028    program->next_fp_mode.round16_64 = fp_round_ne;
11029    ctx.block->fp_mode = program->next_fp_mode;
11030
11031    add_startpgm(&ctx);
11032    append_logical_start(ctx.block);
11033
11034    Builder bld(ctx.program, ctx.block);
11035
11036    Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), program->private_segment_buffer, Operand(RING_GSVS_VS * 16u));
11037
11038    Operand stream_id(0u);
11039    if (args->shader_info->so.num_outputs)
11040       stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
11041                            get_arg(&ctx, ctx.args->streamout_config), Operand(0x20018u));
11042
11043    Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), get_arg(&ctx, ctx.args->ac.vertex_id));
11044
11045    std::stack<Block> endif_blocks;
11046
11047    for (unsigned stream = 0; stream < 4; stream++) {
11048       if (stream_id.isConstant() && stream != stream_id.constantValue())
11049          continue;
11050
11051       unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
11052       if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
11053          continue;
11054
11055       memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));
11056
11057       unsigned BB_if_idx = ctx.block->index;
11058       Block BB_endif = Block();
11059       if (!stream_id.isConstant()) {
11060          /* begin IF */
11061          Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand(stream));
11062          append_logical_end(ctx.block);
11063          ctx.block->kind |= block_kind_uniform;
11064          bld.branch(aco_opcode::p_cbranch_z, cond);
11065
11066          BB_endif.kind |= ctx.block->kind & block_kind_top_level;
11067
11068          ctx.block = ctx.program->create_and_insert_block();
11069          add_edge(BB_if_idx, ctx.block);
11070          bld.reset(ctx.block);
11071          append_logical_start(ctx.block);
11072       }
11073
11074       unsigned offset = 0;
11075       for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
11076          if (args->shader_info->gs.output_streams[i] != stream)
11077             continue;
11078
11079          unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
11080          unsigned length = util_last_bit(output_usage_mask);
11081          for (unsigned j = 0; j < length; ++j) {
11082             if (!(output_usage_mask & (1 << j)))
11083                continue;
11084
11085             unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
11086             Temp voffset = vtx_offset;
11087             if (const_offset >= 4096u) {
11088                voffset = bld.vadd32(bld.def(v1), Operand(const_offset / 4096u * 4096u), voffset);
11089                const_offset %= 4096u;
11090             }
11091
11092             aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(aco_opcode::buffer_load_dword, Format::MUBUF, 3, 1)};
11093             mubuf->definitions[0] = bld.def(v1);
11094             mubuf->operands[0] = Operand(gsvs_ring);
11095             mubuf->operands[1] = Operand(voffset);
11096             mubuf->operands[2] = Operand(0u);
11097             mubuf->offen = true;
11098             mubuf->offset = const_offset;
11099             mubuf->glc = true;
11100             mubuf->slc = true;
11101             mubuf->dlc = args->options->chip_class >= GFX10;
11102             mubuf->barrier = barrier_none;
11103             mubuf->can_reorder = true;
11104
11105             ctx.outputs.mask[i] |= 1 << j;
11106             ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();
11107
11108             bld.insert(std::move(mubuf));
11109
11110             offset++;
11111          }
11112       }
11113
11114       if (args->shader_info->so.num_outputs) {
11115          emit_streamout(&ctx, stream);
11116          bld.reset(ctx.block);
11117       }
11118
11119       if (stream == 0) {
11120          create_vs_exports(&ctx);
11121          ctx.block->kind |= block_kind_export_end;
11122       }
11123
11124       if (!stream_id.isConstant()) {
11125          append_logical_end(ctx.block);
11126
11127          /* branch from then block to endif block */
11128          bld.branch(aco_opcode::p_branch);
11129          add_edge(ctx.block->index, &BB_endif);
11130          ctx.block->kind |= block_kind_uniform;
11131
11132          /* emit else block */
11133          ctx.block = ctx.program->create_and_insert_block();
11134          add_edge(BB_if_idx, ctx.block);
11135          bld.reset(ctx.block);
11136          append_logical_start(ctx.block);
11137
11138          endif_blocks.push(std::move(BB_endif));
11139       }
11140    }
11141
11142    while (!endif_blocks.empty()) {
11143       Block BB_endif = std::move(endif_blocks.top());
11144       endif_blocks.pop();
11145
11146       Block *BB_else = ctx.block;
11147
11148       append_logical_end(BB_else);
11149       /* branch from else block to endif block */
11150       bld.branch(aco_opcode::p_branch);
11151       add_edge(BB_else->index, &BB_endif);
11152       BB_else->kind |= block_kind_uniform;
11153
11154       /** emit endif merge block */
11155       ctx.block = program->insert_block(std::move(BB_endif));
11156       bld.reset(ctx.block);
11157       append_logical_start(ctx.block);
11158    }
11159
11160    program->config->float_mode = program->blocks[0].fp_mode.val;
11161
11162    append_logical_end(ctx.block);
11163    ctx.block->kind |= block_kind_uniform;
11164    bld.sopp(aco_opcode::s_endpgm);
11165
11166    cleanup_cfg(program);
11167 }
11168 }