aco: use ds_read2_b64/ds_write2_b64 (mesa.git: src/amd/compiler/aco_instruction_selection.cpp)
1 /*
2 * Copyright © 2018 Valve Corporation
3 * Copyright © 2018 Google
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 */
25
26 #include <algorithm>
27 #include <array>
28 #include <map>
29
30 #include "ac_shader_util.h"
31 #include "aco_ir.h"
32 #include "aco_builder.h"
33 #include "aco_interface.h"
34 #include "aco_instruction_selection_setup.cpp"
35 #include "util/fast_idiv_by_const.h"
36
37 namespace aco {
38 namespace {
39
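/* RAII helper: installs the control-flow state for a loop (header index, exit block,
 * divergence flags, nest depth) and restores the previous state on destruction. */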
40 class loop_info_RAII {
41 isel_context* ctx;
42 unsigned header_idx_old;
43 Block* exit_old;
44 bool divergent_cont_old;
45 bool divergent_branch_old;
46 bool divergent_if_old;
47
48 public:
49 loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
50 : ctx(ctx),
51 header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
52 divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
53 divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
54 divergent_if_old(ctx->cf_info.parent_if.is_divergent)
55 {
56 ctx->cf_info.parent_loop.header_idx = loop_header_idx;
57 ctx->cf_info.parent_loop.exit = loop_exit;
58 ctx->cf_info.parent_loop.has_divergent_continue = false;
59 ctx->cf_info.parent_loop.has_divergent_branch = false;
60 ctx->cf_info.parent_if.is_divergent = false;
61 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
62 }
63
64 ~loop_info_RAII()
65 {
66 ctx->cf_info.parent_loop.header_idx = header_idx_old;
67 ctx->cf_info.parent_loop.exit = exit_old;
68 ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
69 ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
70 ctx->cf_info.parent_if.is_divergent = divergent_if_old;
71 ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
72 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
73 ctx->cf_info.exec_potentially_empty = false;
74 }
75 };
76
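/* Bookkeeping for emitting an if/else: the branch condition, the saved divergence and
 * exec state, and the blocks/indices used to stitch the CFG back together. */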
77 struct if_context {
78 Temp cond;
79
80 bool divergent_old;
81 bool exec_potentially_empty_old;
82
83 unsigned BB_if_idx;
84 unsigned invert_idx;
85 bool then_branch_divergent;
86 Block BB_invert;
87 Block BB_endif;
88 };
89
90 static void visit_cf_list(struct isel_context *ctx,
91 struct exec_list *list);
92
93 static void add_logical_edge(unsigned pred_idx, Block *succ)
94 {
95 succ->logical_preds.emplace_back(pred_idx);
96 }
97
98
99 static void add_linear_edge(unsigned pred_idx, Block *succ)
100 {
101 succ->linear_preds.emplace_back(pred_idx);
102 }
103
104 static void add_edge(unsigned pred_idx, Block *succ)
105 {
106 add_logical_edge(pred_idx, succ);
107 add_linear_edge(pred_idx, succ);
108 }
109
110 static void append_logical_start(Block *b)
111 {
112 Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
113 }
114
115 static void append_logical_end(Block *b)
116 {
117 Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
118 }
119
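/* Returns the Temp that was pre-allocated for the given NIR SSA def. */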
120 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
121 {
122 assert(ctx->allocated[def->index].id());
123 return ctx->allocated[def->index];
124 }
125
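/* Copies src into dst; in fragment shaders the copy is emitted as p_wqm so the value is
 * computed in whole-quad mode, optionally flagging the program as needing WQM. */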
126 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
127 {
128 Builder bld(ctx->program, ctx->block);
129
130 if (!dst.id())
131 dst = bld.tmp(src.regClass());
132
133 if (ctx->stage != fragment_fs) {
134 if (!dst.id())
135 return src;
136
137 if (src.type() == RegType::vgpr || src.size() > 1)
138 bld.copy(Definition(dst), src);
139 else
140 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
141 return dst;
142 }
143
144 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
145 ctx->program->needs_wqm |= program_needs_wqm;
146 return dst;
147 }
148
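/* Ensures a value lives in VGPRs: SGPR values are copied, VGPR values are returned as-is. */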
149 Temp as_vgpr(isel_context *ctx, Temp val)
150 {
151 if (val.type() == RegType::sgpr) {
152 Builder bld(ctx->program, ctx->block);
153 return bld.copy(bld.def(RegType::vgpr, val.size()), val);
154 }
155 assert(val.type() == RegType::vgpr);
156 return val;
157 }
158
159 //assumes a != 0xffffffff
160 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
161 {
162 assert(b != 0);
163 Builder bld(ctx->program, ctx->block);
164
165 if (util_is_power_of_two_or_zero(b)) {
166 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
167 return;
168 }
169
170 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
171
172 assert(info.multiplier <= 0xffffffff);
173
174 bool pre_shift = info.pre_shift != 0;
175 bool increment = info.increment != 0;
176 bool multiply = true;
177 bool post_shift = info.post_shift != 0;
178
179 if (!pre_shift && !increment && !multiply && !post_shift) {
180 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
181 return;
182 }
183
184 Temp pre_shift_dst = a;
185 if (pre_shift) {
186 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
187 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
188 }
189
190 Temp increment_dst = pre_shift_dst;
191 if (increment) {
192 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
193 bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
194 }
195
196 Temp multiply_dst = increment_dst;
197 if (multiply) {
198 multiply_dst = post_shift ? bld.tmp(v1) : dst;
199 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
200 bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
201 }
202
203 if (post_shift) {
204 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
205 }
206 }
207
208 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
209 {
210 Builder bld(ctx->program, ctx->block);
211 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
212 }
213
214
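/* Returns element idx of src with register class dst_rc, reusing elements cached by
 * emit_split_vector when possible. */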
215 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
216 {
217 /* no need to extract the whole vector */
218 if (src.regClass() == dst_rc) {
219 assert(idx == 0);
220 return src;
221 }
222 assert(src.size() > idx);
223 Builder bld(ctx->program, ctx->block);
224 auto it = ctx->allocated_vec.find(src.id());
225 /* the size check needs to be early because elements other than 0 may be garbage */
226 if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) {
227 if (it->second[idx].regClass() == dst_rc) {
228 return it->second[idx];
229 } else {
230 assert(dst_rc.size() == it->second[idx].regClass().size());
231 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
232 return bld.copy(bld.def(dst_rc), it->second[idx]);
233 }
234 }
235
236 if (src.size() == dst_rc.size()) {
237 assert(idx == 0);
238 return bld.copy(bld.def(dst_rc), src);
239 } else {
240 Temp dst = bld.tmp(dst_rc);
241 emit_extract_vector(ctx, src, idx, dst);
242 return dst;
243 }
244 }
245
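/* Splits vec_src into num_components elements and caches them in allocated_vec so that
 * later extracts can reuse them. */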
246 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
247 {
248 if (num_components == 1)
249 return;
250 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
251 return;
252 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
253 split->operands[0] = Operand(vec_src);
254 std::array<Temp,4> elems;
255 for (unsigned i = 0; i < num_components; i++) {
256 elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
257 split->definitions[i] = Definition(elems[i]);
258 }
259 ctx->block->instructions.emplace_back(std::move(split));
260 ctx->allocated_vec.emplace(vec_src.id(), elems);
261 }
262
263 /* This vector expansion uses a mask to determine which elements in the new vector
264 * come from the original vector. The other elements are undefined. */
265 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
266 {
267 emit_split_vector(ctx, vec_src, util_bitcount(mask));
268
269 if (vec_src == dst)
270 return;
271
272 Builder bld(ctx->program, ctx->block);
273 if (num_components == 1) {
274 if (dst.type() == RegType::sgpr)
275 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
276 else
277 bld.copy(Definition(dst), vec_src);
278 return;
279 }
280
281 unsigned component_size = dst.size() / num_components;
282 std::array<Temp,4> elems;
283
284 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
285 vec->definitions[0] = Definition(dst);
286 unsigned k = 0;
287 for (unsigned i = 0; i < num_components; i++) {
288 if (mask & (1 << i)) {
289 Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
290 if (dst.type() == RegType::sgpr)
291 src = bld.as_uniform(src);
292 vec->operands[i] = Operand(src);
293 } else {
294 vec->operands[i] = Operand(0u);
295 }
296 elems[i] = vec->operands[i].getTemp();
297 }
298 ctx->block->instructions.emplace_back(std::move(vec));
299 ctx->allocated_vec.emplace(dst.id(), elems);
300 }
301
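/* Converts a uniform boolean (s1, valid in SCC) into a lane mask (s2) with s_cselect_b64,
 * optionally hinting the result to VCC; s2 values are passed through. */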
302 Temp as_divergent_bool(isel_context *ctx, Temp val, bool vcc_hint)
303 {
304 if (val.regClass() == s2) {
305 return val;
306 } else {
307 assert(val.regClass() == s1);
308 Builder bld(ctx->program, ctx->block);
309 Definition& def = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2),
310 Operand((uint32_t) -1), Operand(0u), bld.scc(val)).def(0);
311 if (vcc_hint)
312 def.setHint(vcc);
313 return def.getTemp();
314 }
315 }
316
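/* Converts a lane-mask boolean (s2) into a uniform scalar boolean (s1/SCC) by comparing
 * it against zero. */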
317 Temp as_uniform_bool(isel_context *ctx, Temp val)
318 {
319 if (val.regClass() == s1) {
320 return val;
321 } else {
322 assert(val.regClass() == s2);
323 Builder bld(ctx->program, ctx->block);
324 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */
325 return bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), Operand(0u), emit_wqm(ctx, val));
326 }
327 }
328
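/* Returns the given ALU source as a Temp of 'size' components, applying the NIR swizzle
 * by extracting and recombining vector elements where necessary. */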
329 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
330 {
331 if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
332 return get_ssa_temp(ctx, src.src.ssa);
333
334 if (src.src.ssa->num_components == size) {
335 bool identity_swizzle = true;
336 for (unsigned i = 0; identity_swizzle && i < size; i++) {
337 if (src.swizzle[i] != i)
338 identity_swizzle = false;
339 }
340 if (identity_swizzle)
341 return get_ssa_temp(ctx, src.src.ssa);
342 }
343
344 Temp vec = get_ssa_temp(ctx, src.src.ssa);
345 unsigned elem_size = vec.size() / src.src.ssa->num_components;
346 assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */
347 assert(vec.size() % elem_size == 0);
348
349 RegClass elem_rc = RegClass(vec.type(), elem_size);
350 if (size == 1) {
351 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
352 } else {
353 assert(size <= 4);
354 std::array<Temp,4> elems;
355 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
356 for (unsigned i = 0; i < size; ++i) {
357 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
358 vec_instr->operands[i] = Operand{elems[i]};
359 }
360 Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)};
361 vec_instr->definitions[0] = Definition(dst);
362 ctx->block->instructions.emplace_back(std::move(vec_instr));
363 ctx->allocated_vec.emplace(dst.id(), elems);
364 return dst;
365 }
366 }
367
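/* Widens a 32-bit address to a 64-bit pointer by appending the high address bits;
 * VGPR pointers are first made uniform with v_readfirstlane_b32. */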
368 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
369 {
370 if (ptr.size() == 2)
371 return ptr;
372 Builder bld(ctx->program, ctx->block);
373 if (ptr.type() == RegType::vgpr)
374 ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
375 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
376 ptr, Operand((unsigned)ctx->options->address32_hi));
377 }
378
379 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
380 {
381 aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
382 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
383 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
384 sop2->definitions[0] = Definition(dst);
385 if (writes_scc)
386 sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
387 ctx->block->instructions.emplace_back(std::move(sop2));
388 }
389
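/* Emits a VOP2 instruction; an SGPR in src1 is legalized by swapping commutative operands,
 * falling back to the VOP3-encoded form, or copying the operand to a VGPR. */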
390 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool commutative, bool swap_srcs=false)
391 {
392 Builder bld(ctx->program, ctx->block);
393 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
394 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
395 if (src1.type() == RegType::sgpr) {
396 if (commutative && src0.type() == RegType::vgpr) {
397 Temp t = src0;
398 src0 = src1;
399 src1 = t;
400 } else if (src0.type() == RegType::vgpr &&
401 op != aco_opcode::v_madmk_f32 &&
402 op != aco_opcode::v_madak_f32 &&
403 op != aco_opcode::v_madmk_f16 &&
404 op != aco_opcode::v_madak_f16) {
405 /* If the instruction is not commutative, we emit a VOP3A instruction */
406 bld.vop2_e64(op, Definition(dst), src0, src1);
407 return;
408 } else {
409 src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
410 }
411 }
412 bld.vop2(op, Definition(dst), src0, src1);
413 }
414
415 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
416 {
417 Temp src0 = get_alu_src(ctx, instr->src[0]);
418 Temp src1 = get_alu_src(ctx, instr->src[1]);
419 Temp src2 = get_alu_src(ctx, instr->src[2]);
420
 421 /* ensure that the instruction has at most 1 sgpr operand;
 422 * the optimizer will inline constants for us */
423 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
424 src0 = as_vgpr(ctx, src0);
425 if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
426 src1 = as_vgpr(ctx, src1);
427 if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
428 src2 = as_vgpr(ctx, src2);
429
430 Builder bld(ctx->program, ctx->block);
431 bld.vop3(op, Definition(dst), src0, src1, src2);
432 }
433
434 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
435 {
436 Builder bld(ctx->program, ctx->block);
437 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
438 }
439
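/* Emits a VOPC comparison; if src1 is an SGPR, the operands are swapped and the compare
 * opcode is mirrored (or src1 is copied to a VGPR). */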
440 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
441 {
442 Temp src0 = get_alu_src(ctx, instr->src[0]);
443 Temp src1 = get_alu_src(ctx, instr->src[1]);
444 aco_ptr<Instruction> vopc;
445 if (src1.type() == RegType::sgpr) {
446 if (src0.type() == RegType::vgpr) {
447 /* to swap the operands, we might also have to change the opcode */
448 switch (op) {
449 case aco_opcode::v_cmp_lt_f32:
450 op = aco_opcode::v_cmp_gt_f32;
451 break;
452 case aco_opcode::v_cmp_ge_f32:
453 op = aco_opcode::v_cmp_le_f32;
454 break;
455 case aco_opcode::v_cmp_lt_i32:
456 op = aco_opcode::v_cmp_gt_i32;
457 break;
458 case aco_opcode::v_cmp_ge_i32:
459 op = aco_opcode::v_cmp_le_i32;
460 break;
461 case aco_opcode::v_cmp_lt_u32:
462 op = aco_opcode::v_cmp_gt_u32;
463 break;
464 case aco_opcode::v_cmp_ge_u32:
465 op = aco_opcode::v_cmp_le_u32;
466 break;
467 case aco_opcode::v_cmp_lt_f64:
468 op = aco_opcode::v_cmp_gt_f64;
469 break;
470 case aco_opcode::v_cmp_ge_f64:
471 op = aco_opcode::v_cmp_le_f64;
472 break;
473 case aco_opcode::v_cmp_lt_i64:
474 op = aco_opcode::v_cmp_gt_i64;
475 break;
476 case aco_opcode::v_cmp_ge_i64:
477 op = aco_opcode::v_cmp_le_i64;
478 break;
479 case aco_opcode::v_cmp_lt_u64:
480 op = aco_opcode::v_cmp_gt_u64;
481 break;
482 case aco_opcode::v_cmp_ge_u64:
483 op = aco_opcode::v_cmp_le_u64;
484 break;
485 default: /* eq and ne are commutative */
486 break;
487 }
488 Temp t = src0;
489 src0 = src1;
490 src1 = t;
491 } else {
492 src1 = as_vgpr(ctx, src1);
493 }
494 }
495 Builder bld(ctx->program, ctx->block);
496 bld.vopc(op, Definition(dst), src0, src1).def(0).setHint(vcc);
497 }
498
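/* Emits a comparison: VOPC producing a lane mask for divergent (s2) results,
 * SOPC producing SCC for uniform (s1) results. */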
499 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
500 {
501 if (dst.regClass() == s2) {
502 emit_vopc_instruction(ctx, instr, op, dst);
503 if (!ctx->divergent_vals[instr->dest.dest.ssa.index])
504 emit_split_vector(ctx, dst, 2);
505 } else if (dst.regClass() == s1) {
506 Temp src0 = get_alu_src(ctx, instr->src[0]);
507 Temp src1 = get_alu_src(ctx, instr->src[1]);
508 assert(src0.type() == RegType::sgpr && src1.type() == RegType::sgpr);
509
510 Builder bld(ctx->program, ctx->block);
511 bld.sopc(op, bld.scc(Definition(dst)), src0, src1);
512
513 } else {
514 assert(false);
515 }
516 }
517
518 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, aco_opcode op32, aco_opcode op64, Temp dst)
519 {
520 Builder bld(ctx->program, ctx->block);
521 Temp src0 = get_alu_src(ctx, instr->src[0]);
522 Temp src1 = get_alu_src(ctx, instr->src[1]);
523 if (dst.regClass() == s2) {
524 bld.sop2(op64, Definition(dst), bld.def(s1, scc),
525 as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
526 } else {
527 assert(dst.regClass() == s1);
528 bld.sop2(op32, bld.def(s1), bld.scc(Definition(dst)),
529 as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
530 }
531 }
532
533
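/* Lowers nir_op_bcsel: v_cndmask_b32 for VGPR results, s_cselect for uniform values,
 * and mask arithmetic for divergent boolean results. */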
534 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
535 {
536 Builder bld(ctx->program, ctx->block);
537 Temp cond = get_alu_src(ctx, instr->src[0]);
538 Temp then = get_alu_src(ctx, instr->src[1]);
539 Temp els = get_alu_src(ctx, instr->src[2]);
540
541 if (dst.type() == RegType::vgpr) {
542 cond = as_divergent_bool(ctx, cond, true);
543
544 aco_ptr<Instruction> bcsel;
545 if (dst.size() == 1) {
546 then = as_vgpr(ctx, then);
547 els = as_vgpr(ctx, els);
548
549 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
550 } else if (dst.size() == 2) {
551 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
552 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
553 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
554 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
555
556 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
557 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
558
559 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
560 } else {
561 fprintf(stderr, "Unimplemented NIR instr bit size: ");
562 nir_print_instr(&instr->instr, stderr);
563 fprintf(stderr, "\n");
564 }
565 return;
566 }
567
568 if (instr->dest.dest.ssa.bit_size != 1) { /* uniform condition and values in sgpr */
569 if (dst.regClass() == s1 || dst.regClass() == s2) {
570 assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
571 aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
572 bld.sop2(op, Definition(dst), then, els, bld.scc(as_uniform_bool(ctx, cond)));
573 } else {
574 fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
575 nir_print_instr(&instr->instr, stderr);
576 fprintf(stderr, "\n");
577 }
578 return;
579 }
580
581 /* boolean bcsel */
582 assert(instr->dest.dest.ssa.bit_size == 1);
583
584 if (dst.regClass() == s1)
585 cond = as_uniform_bool(ctx, cond);
586
587 if (cond.regClass() == s1) { /* uniform selection */
588 aco_opcode op;
589 if (dst.regClass() == s2) {
590 op = aco_opcode::s_cselect_b64;
591 then = as_divergent_bool(ctx, then, false);
592 els = as_divergent_bool(ctx, els, false);
593 } else {
594 assert(dst.regClass() == s1);
595 op = aco_opcode::s_cselect_b32;
596 then = as_uniform_bool(ctx, then);
597 els = as_uniform_bool(ctx, els);
598 }
599 bld.sop2(op, Definition(dst), then, els, bld.scc(cond));
600 return;
601 }
602
603 /* divergent boolean bcsel
 604 * this implements bcsel on bools: dst = s0 ? s1 : s2,
 605 * which is lowered to: dst = (s0 & s1) | (~s0 & s2) */
606 assert (dst.regClass() == s2);
607 then = as_divergent_bool(ctx, then, false);
608 els = as_divergent_bool(ctx, els, false);
609
610 if (cond.id() != then.id())
611 then = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), cond, then);
612
613 if (cond.id() == els.id())
614 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), then);
615 else
616 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), then,
617 bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond));
618 }
619
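/* Translates a single NIR ALU instruction into ACO IR, dispatching on the opcode and the
 * destination register class. */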
620 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
621 {
622 if (!instr->dest.dest.is_ssa) {
623 fprintf(stderr, "nir alu dst not in ssa: ");
624 nir_print_instr(&instr->instr, stderr);
625 fprintf(stderr, "\n");
626 abort();
627 }
628 Builder bld(ctx->program, ctx->block);
629 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
630 switch(instr->op) {
631 case nir_op_vec2:
632 case nir_op_vec3:
633 case nir_op_vec4: {
634 std::array<Temp,4> elems;
635 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
636 for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
637 elems[i] = get_alu_src(ctx, instr->src[i]);
638 vec->operands[i] = Operand{elems[i]};
639 }
640 vec->definitions[0] = Definition(dst);
641 ctx->block->instructions.emplace_back(std::move(vec));
642 ctx->allocated_vec.emplace(dst.id(), elems);
643 break;
644 }
645 case nir_op_mov: {
646 Temp src = get_alu_src(ctx, instr->src[0]);
647 aco_ptr<Instruction> mov;
648 if (dst.type() == RegType::sgpr) {
649 if (src.type() == RegType::vgpr)
650 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
651 else if (src.regClass() == s1)
652 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
653 else if (src.regClass() == s2)
654 bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
655 else
 656 unreachable("wrong src register class for nir_op_mov");
657 } else if (dst.regClass() == v1) {
658 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
659 } else if (dst.regClass() == v2) {
660 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
661 } else {
662 nir_print_instr(&instr->instr, stderr);
663 unreachable("Should have been lowered to scalar.");
664 }
665 break;
666 }
667 case nir_op_inot: {
668 Temp src = get_alu_src(ctx, instr->src[0]);
669 /* uniform booleans */
670 if (instr->dest.dest.ssa.bit_size == 1 && dst.regClass() == s1) {
671 if (src.regClass() == s1) {
672 /* in this case, src is either 1 or 0 */
673 bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.scc(Definition(dst)), Operand(1u), src);
674 } else {
675 /* src is either exec_mask or 0 */
676 assert(src.regClass() == s2);
677 bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(Definition(dst)), Operand(0u), src);
678 }
679 } else if (dst.regClass() == v1) {
680 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
681 } else if (dst.type() == RegType::sgpr) {
682 aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
683 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
684 } else {
685 fprintf(stderr, "Unimplemented NIR instr bit size: ");
686 nir_print_instr(&instr->instr, stderr);
687 fprintf(stderr, "\n");
688 }
689 break;
690 }
691 case nir_op_ineg: {
692 Temp src = get_alu_src(ctx, instr->src[0]);
693 if (dst.regClass() == v1) {
694 bld.vsub32(Definition(dst), Operand(0u), Operand(src));
695 } else if (dst.regClass() == s1) {
696 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
697 } else if (dst.size() == 2) {
698 Temp src0 = bld.tmp(dst.type(), 1);
699 Temp src1 = bld.tmp(dst.type(), 1);
700 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
701
702 if (dst.regClass() == s2) {
703 Temp carry = bld.tmp(s1);
704 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
705 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
706 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
707 } else {
708 Temp lower = bld.tmp(v1);
709 Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
710 Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
711 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
712 }
713 } else {
714 fprintf(stderr, "Unimplemented NIR instr bit size: ");
715 nir_print_instr(&instr->instr, stderr);
716 fprintf(stderr, "\n");
717 }
718 break;
719 }
720 case nir_op_iabs: {
721 if (dst.regClass() == s1) {
722 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
723 } else if (dst.regClass() == v1) {
724 Temp src = get_alu_src(ctx, instr->src[0]);
725 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
726 } else {
727 fprintf(stderr, "Unimplemented NIR instr bit size: ");
728 nir_print_instr(&instr->instr, stderr);
729 fprintf(stderr, "\n");
730 }
731 break;
732 }
733 case nir_op_isign: {
734 Temp src = get_alu_src(ctx, instr->src[0]);
735 if (dst.regClass() == s1) {
736 Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
737 Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u));
738 bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp);
739 } else if (dst.regClass() == s2) {
740 Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
741 Temp neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
742 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, neqz);
743 } else if (dst.regClass() == v1) {
744 Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
745 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
746 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
747 } else if (dst.regClass() == v2) {
748 Temp upper = emit_extract_vector(ctx, src, 1, v1);
749 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
750 Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
751 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
752 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
753 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
754 } else {
755 fprintf(stderr, "Unimplemented NIR instr bit size: ");
756 nir_print_instr(&instr->instr, stderr);
757 fprintf(stderr, "\n");
758 }
759 break;
760 }
761 case nir_op_imax: {
762 if (dst.regClass() == v1) {
763 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
764 } else if (dst.regClass() == s1) {
765 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
766 } else {
767 fprintf(stderr, "Unimplemented NIR instr bit size: ");
768 nir_print_instr(&instr->instr, stderr);
769 fprintf(stderr, "\n");
770 }
771 break;
772 }
773 case nir_op_umax: {
774 if (dst.regClass() == v1) {
775 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
776 } else if (dst.regClass() == s1) {
777 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
778 } else {
779 fprintf(stderr, "Unimplemented NIR instr bit size: ");
780 nir_print_instr(&instr->instr, stderr);
781 fprintf(stderr, "\n");
782 }
783 break;
784 }
785 case nir_op_imin: {
786 if (dst.regClass() == v1) {
787 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
788 } else if (dst.regClass() == s1) {
789 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
790 } else {
791 fprintf(stderr, "Unimplemented NIR instr bit size: ");
792 nir_print_instr(&instr->instr, stderr);
793 fprintf(stderr, "\n");
794 }
795 break;
796 }
797 case nir_op_umin: {
798 if (dst.regClass() == v1) {
799 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
800 } else if (dst.regClass() == s1) {
801 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
802 } else {
803 fprintf(stderr, "Unimplemented NIR instr bit size: ");
804 nir_print_instr(&instr->instr, stderr);
805 fprintf(stderr, "\n");
806 }
807 break;
808 }
809 case nir_op_ior: {
810 if (instr->dest.dest.ssa.bit_size == 1) {
811 emit_boolean_logic(ctx, instr, aco_opcode::s_or_b32, aco_opcode::s_or_b64, dst);
812 } else if (dst.regClass() == v1) {
813 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
814 } else if (dst.regClass() == s1) {
815 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
816 } else if (dst.regClass() == s2) {
817 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
818 } else {
819 fprintf(stderr, "Unimplemented NIR instr bit size: ");
820 nir_print_instr(&instr->instr, stderr);
821 fprintf(stderr, "\n");
822 }
823 break;
824 }
825 case nir_op_iand: {
826 if (instr->dest.dest.ssa.bit_size == 1) {
827 emit_boolean_logic(ctx, instr, aco_opcode::s_and_b32, aco_opcode::s_and_b64, dst);
828 } else if (dst.regClass() == v1) {
829 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
830 } else if (dst.regClass() == s1) {
831 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
832 } else if (dst.regClass() == s2) {
833 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
834 } else {
835 fprintf(stderr, "Unimplemented NIR instr bit size: ");
836 nir_print_instr(&instr->instr, stderr);
837 fprintf(stderr, "\n");
838 }
839 break;
840 }
841 case nir_op_ixor: {
842 if (instr->dest.dest.ssa.bit_size == 1) {
843 emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::s_xor_b64, dst);
844 } else if (dst.regClass() == v1) {
845 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
846 } else if (dst.regClass() == s1) {
847 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
848 } else if (dst.regClass() == s2) {
849 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
850 } else {
851 fprintf(stderr, "Unimplemented NIR instr bit size: ");
852 nir_print_instr(&instr->instr, stderr);
853 fprintf(stderr, "\n");
854 }
855 break;
856 }
857 case nir_op_ushr: {
858 if (dst.regClass() == v1) {
859 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
860 } else if (dst.regClass() == v2) {
861 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
862 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
863 } else if (dst.regClass() == s2) {
864 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
865 } else if (dst.regClass() == s1) {
866 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
867 } else {
868 fprintf(stderr, "Unimplemented NIR instr bit size: ");
869 nir_print_instr(&instr->instr, stderr);
870 fprintf(stderr, "\n");
871 }
872 break;
873 }
874 case nir_op_ishl: {
875 if (dst.regClass() == v1) {
876 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
877 } else if (dst.regClass() == v2) {
878 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
879 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
880 } else if (dst.regClass() == s1) {
881 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
882 } else if (dst.regClass() == s2) {
883 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
884 } else {
885 fprintf(stderr, "Unimplemented NIR instr bit size: ");
886 nir_print_instr(&instr->instr, stderr);
887 fprintf(stderr, "\n");
888 }
889 break;
890 }
891 case nir_op_ishr: {
892 if (dst.regClass() == v1) {
893 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
894 } else if (dst.regClass() == v2) {
895 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
896 get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
897 } else if (dst.regClass() == s1) {
898 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
899 } else if (dst.regClass() == s2) {
900 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
901 } else {
902 fprintf(stderr, "Unimplemented NIR instr bit size: ");
903 nir_print_instr(&instr->instr, stderr);
904 fprintf(stderr, "\n");
905 }
906 break;
907 }
908 case nir_op_find_lsb: {
909 Temp src = get_alu_src(ctx, instr->src[0]);
910 if (src.regClass() == s1) {
911 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
912 } else if (src.regClass() == v1) {
913 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
914 } else if (src.regClass() == s2) {
915 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
916 } else {
917 fprintf(stderr, "Unimplemented NIR instr bit size: ");
918 nir_print_instr(&instr->instr, stderr);
919 fprintf(stderr, "\n");
920 }
921 break;
922 }
923 case nir_op_ufind_msb:
924 case nir_op_ifind_msb: {
925 Temp src = get_alu_src(ctx, instr->src[0]);
926 if (src.regClass() == s1 || src.regClass() == s2) {
927 aco_opcode op = src.regClass() == s2 ?
928 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
929 (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
930 Temp msb_rev = bld.sop1(op, bld.def(s1), src);
931
932 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
933 Operand(src.size() * 32u - 1u), msb_rev);
934 Temp msb = sub.def(0).getTemp();
935 Temp carry = sub.def(1).getTemp();
936
937 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, carry);
938 } else if (src.regClass() == v1) {
939 aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
940 Temp msb_rev = bld.tmp(v1);
941 emit_vop1_instruction(ctx, instr, op, msb_rev);
942 Temp msb = bld.tmp(v1);
943 Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
944 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
945 } else {
946 fprintf(stderr, "Unimplemented NIR instr bit size: ");
947 nir_print_instr(&instr->instr, stderr);
948 fprintf(stderr, "\n");
949 }
950 break;
951 }
952 case nir_op_bitfield_reverse: {
953 if (dst.regClass() == s1) {
954 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
955 } else if (dst.regClass() == v1) {
956 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
957 } else {
958 fprintf(stderr, "Unimplemented NIR instr bit size: ");
959 nir_print_instr(&instr->instr, stderr);
960 fprintf(stderr, "\n");
961 }
962 break;
963 }
964 case nir_op_iadd: {
965 if (dst.regClass() == s1) {
966 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
967 break;
968 }
969
970 Temp src0 = get_alu_src(ctx, instr->src[0]);
971 Temp src1 = get_alu_src(ctx, instr->src[1]);
972 if (dst.regClass() == v1) {
973 bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
974 break;
975 }
976
977 assert(src0.size() == 2 && src1.size() == 2);
978 Temp src00 = bld.tmp(src0.type(), 1);
979 Temp src01 = bld.tmp(dst.type(), 1);
980 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
981 Temp src10 = bld.tmp(src1.type(), 1);
982 Temp src11 = bld.tmp(dst.type(), 1);
983 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
984
985 if (dst.regClass() == s2) {
986 Temp carry = bld.tmp(s1);
987 Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
988 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
989 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
990 } else if (dst.regClass() == v2) {
991 Temp dst0 = bld.tmp(v1);
992 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
993 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
994 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
995 } else {
996 fprintf(stderr, "Unimplemented NIR instr bit size: ");
997 nir_print_instr(&instr->instr, stderr);
998 fprintf(stderr, "\n");
999 }
1000 break;
1001 }
1002 case nir_op_uadd_sat: {
1003 Temp src0 = get_alu_src(ctx, instr->src[0]);
1004 Temp src1 = get_alu_src(ctx, instr->src[1]);
1005 if (dst.regClass() == s1) {
1006 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1007 bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1008 src0, src1);
1009 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1010 } else if (dst.regClass() == v1) {
1011 if (ctx->options->chip_class >= GFX9) {
1012 aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1013 add->operands[0] = Operand(src0);
1014 add->operands[1] = Operand(src1);
1015 add->definitions[0] = Definition(dst);
1016 add->clamp = 1;
1017 ctx->block->instructions.emplace_back(std::move(add));
1018 } else {
1019 if (src1.regClass() != v1)
1020 std::swap(src0, src1);
1021 assert(src1.regClass() == v1);
1022 Temp tmp = bld.tmp(v1);
1023 Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1024 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1025 }
1026 } else {
1027 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1028 nir_print_instr(&instr->instr, stderr);
1029 fprintf(stderr, "\n");
1030 }
1031 break;
1032 }
1033 case nir_op_uadd_carry: {
1034 Temp src0 = get_alu_src(ctx, instr->src[0]);
1035 Temp src1 = get_alu_src(ctx, instr->src[1]);
1036 if (dst.regClass() == s1) {
1037 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1038 break;
1039 }
1040 if (dst.regClass() == v1) {
1041 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1042 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1043 break;
1044 }
1045
1046 Temp src00 = bld.tmp(src0.type(), 1);
1047 Temp src01 = bld.tmp(dst.type(), 1);
1048 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1049 Temp src10 = bld.tmp(src1.type(), 1);
1050 Temp src11 = bld.tmp(dst.type(), 1);
1051 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1052 if (dst.regClass() == s2) {
1053 Temp carry = bld.tmp(s1);
1054 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1055 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1056 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1057 } else if (dst.regClass() == v2) {
1058 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1059 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1060 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1061 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1062 } else {
1063 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1064 nir_print_instr(&instr->instr, stderr);
1065 fprintf(stderr, "\n");
1066 }
1067 break;
1068 }
1069 case nir_op_isub: {
1070 if (dst.regClass() == s1) {
1071 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1072 break;
1073 }
1074
1075 Temp src0 = get_alu_src(ctx, instr->src[0]);
1076 Temp src1 = get_alu_src(ctx, instr->src[1]);
1077 if (dst.regClass() == v1) {
1078 bld.vsub32(Definition(dst), src0, src1);
1079 break;
1080 }
1081
1082 Temp src00 = bld.tmp(src0.type(), 1);
1083 Temp src01 = bld.tmp(dst.type(), 1);
1084 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1085 Temp src10 = bld.tmp(src1.type(), 1);
1086 Temp src11 = bld.tmp(dst.type(), 1);
1087 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1088 if (dst.regClass() == s2) {
1089 Temp carry = bld.tmp(s1);
1090 Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1091 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1092 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1093 } else if (dst.regClass() == v2) {
1094 Temp lower = bld.tmp(v1);
1095 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1096 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1097 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1098 } else {
1099 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1100 nir_print_instr(&instr->instr, stderr);
1101 fprintf(stderr, "\n");
1102 }
1103 break;
1104 }
1105 case nir_op_usub_borrow: {
1106 Temp src0 = get_alu_src(ctx, instr->src[0]);
1107 Temp src1 = get_alu_src(ctx, instr->src[1]);
1108 if (dst.regClass() == s1) {
1109 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1110 break;
1111 } else if (dst.regClass() == v1) {
1112 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1113 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1114 break;
1115 }
1116
1117 Temp src00 = bld.tmp(src0.type(), 1);
1118 Temp src01 = bld.tmp(dst.type(), 1);
1119 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1120 Temp src10 = bld.tmp(src1.type(), 1);
1121 Temp src11 = bld.tmp(dst.type(), 1);
1122 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1123 if (dst.regClass() == s2) {
1124 Temp borrow = bld.tmp(s1);
1125 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1126 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1127 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1128 } else if (dst.regClass() == v2) {
1129 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1130 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1131 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1132 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1133 } else {
1134 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1135 nir_print_instr(&instr->instr, stderr);
1136 fprintf(stderr, "\n");
1137 }
1138 break;
1139 }
1140 case nir_op_imul: {
1141 if (dst.regClass() == v1) {
1142 bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1143 get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1144 } else if (dst.regClass() == s1) {
1145 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1146 } else {
1147 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1148 nir_print_instr(&instr->instr, stderr);
1149 fprintf(stderr, "\n");
1150 }
1151 break;
1152 }
1153 case nir_op_umul_high: {
1154 if (dst.regClass() == v1) {
1155 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1156 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1157 bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1158 } else if (dst.regClass() == s1) {
1159 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1160 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1161 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1162 } else {
1163 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1164 nir_print_instr(&instr->instr, stderr);
1165 fprintf(stderr, "\n");
1166 }
1167 break;
1168 }
1169 case nir_op_imul_high: {
1170 if (dst.regClass() == v1) {
1171 bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1172 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1173 bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1174 } else if (dst.regClass() == s1) {
1175 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1176 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1177 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1178 } else {
1179 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1180 nir_print_instr(&instr->instr, stderr);
1181 fprintf(stderr, "\n");
1182 }
1183 break;
1184 }
1185 case nir_op_fmul: {
1186 if (dst.size() == 1) {
1187 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1188 } else if (dst.size() == 2) {
1189 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1190 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1191 } else {
1192 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1193 nir_print_instr(&instr->instr, stderr);
1194 fprintf(stderr, "\n");
1195 }
1196 break;
1197 }
1198 case nir_op_fadd: {
1199 if (dst.size() == 1) {
1200 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1201 } else if (dst.size() == 2) {
1202 bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1203 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1204 } else {
1205 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1206 nir_print_instr(&instr->instr, stderr);
1207 fprintf(stderr, "\n");
1208 }
1209 break;
1210 }
1211 case nir_op_fsub: {
1212 Temp src0 = get_alu_src(ctx, instr->src[0]);
1213 Temp src1 = get_alu_src(ctx, instr->src[1]);
1214 if (dst.size() == 1) {
1215 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1216 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1217 else
1218 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1219 } else if (dst.size() == 2) {
1220 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1221 get_alu_src(ctx, instr->src[0]),
1222 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1223 VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1224 sub->neg[1] = true;
1225 } else {
1226 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1227 nir_print_instr(&instr->instr, stderr);
1228 fprintf(stderr, "\n");
1229 }
1230 break;
1231 }
1232 case nir_op_fmax: {
1233 if (dst.size() == 1) {
1234 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true);
1235 } else if (dst.size() == 2) {
1236 bld.vop3(aco_opcode::v_max_f64, Definition(dst),
1237 get_alu_src(ctx, instr->src[0]),
1238 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1239 } else {
1240 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1241 nir_print_instr(&instr->instr, stderr);
1242 fprintf(stderr, "\n");
1243 }
1244 break;
1245 }
1246 case nir_op_fmin: {
1247 if (dst.size() == 1) {
1248 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true);
1249 } else if (dst.size() == 2) {
1250 bld.vop3(aco_opcode::v_min_f64, Definition(dst),
1251 get_alu_src(ctx, instr->src[0]),
1252 as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1253 } else {
1254 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1255 nir_print_instr(&instr->instr, stderr);
1256 fprintf(stderr, "\n");
1257 }
1258 break;
1259 }
1260 case nir_op_fmax3: {
1261 if (dst.size() == 1) {
1262 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst);
1263 } else {
1264 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1265 nir_print_instr(&instr->instr, stderr);
1266 fprintf(stderr, "\n");
1267 }
1268 break;
1269 }
1270 case nir_op_fmin3: {
1271 if (dst.size() == 1) {
1272 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst);
1273 } else {
1274 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1275 nir_print_instr(&instr->instr, stderr);
1276 fprintf(stderr, "\n");
1277 }
1278 break;
1279 }
1280 case nir_op_fmed3: {
1281 if (dst.size() == 1) {
1282 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst);
1283 } else {
1284 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1285 nir_print_instr(&instr->instr, stderr);
1286 fprintf(stderr, "\n");
1287 }
1288 break;
1289 }
1290 case nir_op_umax3: {
1291 if (dst.size() == 1) {
1292 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1293 } else {
1294 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1295 nir_print_instr(&instr->instr, stderr);
1296 fprintf(stderr, "\n");
1297 }
1298 break;
1299 }
1300 case nir_op_umin3: {
1301 if (dst.size() == 1) {
1302 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1303 } else {
1304 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1305 nir_print_instr(&instr->instr, stderr);
1306 fprintf(stderr, "\n");
1307 }
1308 break;
1309 }
1310 case nir_op_umed3: {
1311 if (dst.size() == 1) {
1312 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1313 } else {
1314 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1315 nir_print_instr(&instr->instr, stderr);
1316 fprintf(stderr, "\n");
1317 }
1318 break;
1319 }
1320 case nir_op_imax3: {
1321 if (dst.size() == 1) {
1322 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1323 } else {
1324 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1325 nir_print_instr(&instr->instr, stderr);
1326 fprintf(stderr, "\n");
1327 }
1328 break;
1329 }
1330 case nir_op_imin3: {
1331 if (dst.size() == 1) {
1332 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1333 } else {
1334 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1335 nir_print_instr(&instr->instr, stderr);
1336 fprintf(stderr, "\n");
1337 }
1338 break;
1339 }
1340 case nir_op_imed3: {
1341 if (dst.size() == 1) {
1342 emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1343 } else {
1344 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1345 nir_print_instr(&instr->instr, stderr);
1346 fprintf(stderr, "\n");
1347 }
1348 break;
1349 }
1350 case nir_op_cube_face_coord: {
1351 Temp in = get_alu_src(ctx, instr->src[0], 3);
1352 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1353 emit_extract_vector(ctx, in, 1, v1),
1354 emit_extract_vector(ctx, in, 2, v1) };
1355 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1356 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1357 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1358 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1359 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1360 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1361 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1362 break;
1363 }
1364 case nir_op_cube_face_index: {
1365 Temp in = get_alu_src(ctx, instr->src[0], 3);
1366 Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1367 emit_extract_vector(ctx, in, 1, v1),
1368 emit_extract_vector(ctx, in, 2, v1) };
1369 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1370 break;
1371 }
1372 case nir_op_bcsel: {
1373 emit_bcsel(ctx, instr, dst);
1374 break;
1375 }
1376 case nir_op_frsq: {
1377 if (dst.size() == 1) {
1378 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f32, dst);
1379 } else if (dst.size() == 2) {
1380 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1381 } else {
1382 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1383 nir_print_instr(&instr->instr, stderr);
1384 fprintf(stderr, "\n");
1385 }
1386 break;
1387 }
1388 case nir_op_fneg: {
1389 Temp src = get_alu_src(ctx, instr->src[0]);
1390 if (dst.size() == 1) {
1391 bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1392 } else if (dst.size() == 2) {
1393 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1394 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1395 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1396 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1397 } else {
1398 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1399 nir_print_instr(&instr->instr, stderr);
1400 fprintf(stderr, "\n");
1401 }
1402 break;
1403 }
1404 case nir_op_fabs: {
1405 Temp src = get_alu_src(ctx, instr->src[0]);
1406 if (dst.size() == 1) {
1407 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1408 } else if (dst.size() == 2) {
1409 Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1410 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1411 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1412 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1413 } else {
1414 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1415 nir_print_instr(&instr->instr, stderr);
1416 fprintf(stderr, "\n");
1417 }
1418 break;
1419 }
1420 case nir_op_fsat: {
1421 Temp src = get_alu_src(ctx, instr->src[0]);
1422 if (dst.size() == 1) {
1423 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1424 } else if (dst.size() == 2) {
1425 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
1426 VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
1427 vop3->clamp = true;
1428 } else {
1429 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1430 nir_print_instr(&instr->instr, stderr);
1431 fprintf(stderr, "\n");
1432 }
1433 break;
1434 }
1435 case nir_op_flog2: {
1436 if (dst.size() == 1) {
1437 emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f32, dst);
1438 } else {
1439 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1440 nir_print_instr(&instr->instr, stderr);
1441 fprintf(stderr, "\n");
1442 }
1443 break;
1444 }
1445 case nir_op_frcp: {
1446 if (dst.size() == 1) {
1447 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f32, dst);
1448 } else if (dst.size() == 2) {
1449 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
1450 } else {
1451 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1452 nir_print_instr(&instr->instr, stderr);
1453 fprintf(stderr, "\n");
1454 }
1455 break;
1456 }
1457 case nir_op_fexp2: {
1458 if (dst.size() == 1) {
1459 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
1460 } else {
1461 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1462 nir_print_instr(&instr->instr, stderr);
1463 fprintf(stderr, "\n");
1464 }
1465 break;
1466 }
1467 case nir_op_fsqrt: {
1468 if (dst.size() == 1) {
1469 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f32, dst);
1470 } else if (dst.size() == 2) {
1471 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
1472 } else {
1473 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1474 nir_print_instr(&instr->instr, stderr);
1475 fprintf(stderr, "\n");
1476 }
1477 break;
1478 }
1479 case nir_op_ffract: {
1480 if (dst.size() == 1) {
1481 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
1482 } else if (dst.size() == 2) {
1483 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
1484 } else {
1485 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1486 nir_print_instr(&instr->instr, stderr);
1487 fprintf(stderr, "\n");
1488 }
1489 break;
1490 }
1491 case nir_op_ffloor: {
1492 if (dst.size() == 1) {
1493 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
1494 } else if (dst.size() == 2) {
1495 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f64, dst);
1496 } else {
1497 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1498 nir_print_instr(&instr->instr, stderr);
1499 fprintf(stderr, "\n");
1500 }
1501 break;
1502 }
1503 case nir_op_fceil: {
1504 if (dst.size() == 1) {
1505 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
1506 } else if (dst.size() == 2) {
1507 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
1508 } else {
1509 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1510 nir_print_instr(&instr->instr, stderr);
1511 fprintf(stderr, "\n");
1512 }
1513 break;
1514 }
1515 case nir_op_ftrunc: {
1516 if (dst.size() == 1) {
1517 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
1518 } else if (dst.size() == 2) {
1519 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f64, dst);
1520 } else {
1521 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1522 nir_print_instr(&instr->instr, stderr);
1523 fprintf(stderr, "\n");
1524 }
1525 break;
1526 }
1527 case nir_op_fround_even: {
1528 if (dst.size() == 1) {
1529 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
1530 } else if (dst.size() == 2) {
1531 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
1532 } else {
1533 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1534 nir_print_instr(&instr->instr, stderr);
1535 fprintf(stderr, "\n");
1536 }
1537 break;
1538 }
1539 case nir_op_fsin:
1540 case nir_op_fcos: {
1541 Temp src = get_alu_src(ctx, instr->src[0]);
1542 aco_ptr<Instruction> norm;
1543 if (dst.size() == 1) {
1544 Temp tmp;
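/* 0x3e22f983 is 1/(2*PI): v_sin_f32/v_cos_f32 expect the angle pre-scaled into revolutions */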
1545 Operand half_pi(0x3e22f983u);
1546 if (src.type() == RegType::sgpr)
1547 tmp = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
1548 else
1549 tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
1550
1551 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
1552 if (ctx->options->chip_class < GFX9)
1553 tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
1554
1555 aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
1556 bld.vop1(opcode, Definition(dst), tmp);
1557 } else {
1558 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1559 nir_print_instr(&instr->instr, stderr);
1560 fprintf(stderr, "\n");
1561 }
1562 break;
1563 }
1564 case nir_op_ldexp: {
1565 if (dst.size() == 1) {
1566 bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
1567 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1568 get_alu_src(ctx, instr->src[1]));
1569 } else if (dst.size() == 2) {
1570 bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
1571 as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1572 get_alu_src(ctx, instr->src[1]));
1573 } else {
1574 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1575 nir_print_instr(&instr->instr, stderr);
1576 fprintf(stderr, "\n");
1577 }
1578 break;
1579 }
1580 case nir_op_frexp_sig: {
1581 if (dst.size() == 1) {
1582 bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst),
1583 get_alu_src(ctx, instr->src[0]));
1584 } else if (dst.size() == 2) {
1585 bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst),
1586 get_alu_src(ctx, instr->src[0]));
1587 } else {
1588 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1589 nir_print_instr(&instr->instr, stderr);
1590 fprintf(stderr, "\n");
1591 }
1592 break;
1593 }
1594 case nir_op_frexp_exp: {
1595 if (instr->src[0].src.ssa->bit_size == 32) {
1596 bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst),
1597 get_alu_src(ctx, instr->src[0]));
1598 } else if (instr->src[0].src.ssa->bit_size == 64) {
1599 bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst),
1600 get_alu_src(ctx, instr->src[0]));
1601 } else {
1602 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1603 nir_print_instr(&instr->instr, stderr);
1604 fprintf(stderr, "\n");
1605 }
1606 break;
1607 }
1608 case nir_op_fsign: {
1609 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
1610 if (dst.size() == 1) {
1611 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1612 src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
1613 cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1614 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
1615 } else if (dst.size() == 2) {
1616 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1617 Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1618 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, src, cond);
1619
1620 cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1621 tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
1622 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
1623
1624 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1625 } else {
1626 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1627 nir_print_instr(&instr->instr, stderr);
1628 fprintf(stderr, "\n");
1629 }
1630 break;
1631 }
1632 case nir_op_f2f32: {
1633 if (instr->src[0].src.ssa->bit_size == 64) {
1634 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
1635 } else {
1636 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1637 nir_print_instr(&instr->instr, stderr);
1638 fprintf(stderr, "\n");
1639 }
1640 break;
1641 }
1642 case nir_op_f2f64: {
1643 if (instr->src[0].src.ssa->bit_size == 32) {
1644 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst);
1645 } else {
1646 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1647 nir_print_instr(&instr->instr, stderr);
1648 fprintf(stderr, "\n");
1649 }
1650 break;
1651 }
1652 case nir_op_i2f32: {
1653 assert(dst.size() == 1);
1654 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst);
1655 break;
1656 }
1657 case nir_op_i2f64: {
1658 if (instr->src[0].src.ssa->bit_size == 32) {
1659 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst);
1660 } else if (instr->src[0].src.ssa->bit_size == 64) {
1661 Temp src = get_alu_src(ctx, instr->src[0]);
1662 RegClass rc = RegClass(src.type(), 1);
1663 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1664 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1665 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1666 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
1667 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1668 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1669
1670 } else {
1671 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1672 nir_print_instr(&instr->instr, stderr);
1673 fprintf(stderr, "\n");
1674 }
1675 break;
1676 }
1677 case nir_op_u2f32: {
1678 assert(dst.size() == 1);
1679 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst);
1680 break;
1681 }
1682 case nir_op_u2f64: {
1683 if (instr->src[0].src.ssa->bit_size == 32) {
1684 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst);
1685 } else if (instr->src[0].src.ssa->bit_size == 64) {
1686 Temp src = get_alu_src(ctx, instr->src[0]);
1687 RegClass rc = RegClass(src.type(), 1);
1688 Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1689 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1690 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1691 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
1692 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1693 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1694 } else {
1695 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1696 nir_print_instr(&instr->instr, stderr);
1697 fprintf(stderr, "\n");
1698 }
1699 break;
1700 }
1701 case nir_op_f2i32: {
1702 Temp src = get_alu_src(ctx, instr->src[0]);
1703 if (instr->src[0].src.ssa->bit_size == 32) {
1704 if (dst.type() == RegType::vgpr)
1705 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
1706 else
1707 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1708 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
1709
1710 } else if (instr->src[0].src.ssa->bit_size == 64) {
1711 if (dst.type() == RegType::vgpr)
1712 bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
1713 else
1714 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1715 bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
1716
1717 } else {
1718 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1719 nir_print_instr(&instr->instr, stderr);
1720 fprintf(stderr, "\n");
1721 }
1722 break;
1723 }
1724 case nir_op_f2u32: {
1725 Temp src = get_alu_src(ctx, instr->src[0]);
1726 if (instr->src[0].src.ssa->bit_size == 32) {
1727 if (dst.type() == RegType::vgpr)
1728 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
1729 else
1730 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1731 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
1732
1733 } else if (instr->src[0].src.ssa->bit_size == 64) {
1734 if (dst.type() == RegType::vgpr)
1735 bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
1736 else
1737 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1738 bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
1739
1740 } else {
1741 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1742 nir_print_instr(&instr->instr, stderr);
1743 fprintf(stderr, "\n");
1744 }
1745 break;
1746 }
1747 case nir_op_f2i64: {
1748 Temp src = get_alu_src(ctx, instr->src[0]);
1749 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
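/* Manual float->int64 lowering: take the exponent from v_frexp_exp (clamped to [0, 64]),
 * rebuild the 24-bit mantissa with its implicit bit, place it in the high dword and shift it
 * right by (63 - exponent), saturating when the exponent does not fit. The sign is applied
 * at the end by xor-ing with and subtracting the broadcast sign word (a conditional negate). */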
1750 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1751 exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
1752 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1753 Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
1754 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1755 mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
1756 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1757 Temp new_exponent = bld.tmp(v1);
1758 Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
1759 mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
1760 Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
1761 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1762 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1763 lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
1764 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
1765 lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
1766 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
1767 Temp new_lower = bld.tmp(v1);
1768 borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
1769 Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
1770 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
1771
1772 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
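/* Same lowering as the VGPR path above, done with scalar instructions: the exponent is read
 * straight out of the float bit pattern with s_bfe and the saturate/negate steps go through SCC. */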
1773 if (src.type() == RegType::vgpr)
1774 src = bld.as_uniform(src);
1775 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1776 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1777 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1778 exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
1779 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1780 Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
1781 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1782 mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
1783 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1784 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
1785 mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
1786 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
1787 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
1788 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
1789 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1790 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1791 lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
1792 upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
1793 Temp borrow = bld.tmp(s1);
1794 lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
1795 upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
1796 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1797
1798 } else if (instr->src[0].src.ssa->bit_size == 64) {
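/* 0x3DF00000 and 0xC1F00000 are the high dwords of 2^-32 and -2^32: the truncated double is
 * split into its upper 32 bits (floor(trunc * 2^-32)) and the remaining lower 32 bits via fma,
 * and the two halves are converted separately. */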
1799 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1800 Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1801 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1802 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1803 Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1804 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1805 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1806 Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
1807 if (dst.type() == RegType::sgpr) {
1808 lower = bld.as_uniform(lower);
1809 upper = bld.as_uniform(upper);
1810 }
1811 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1812
1813 } else {
1814 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1815 nir_print_instr(&instr->instr, stderr);
1816 fprintf(stderr, "\n");
1817 }
1818 break;
1819 }
1820 case nir_op_f2u64: {
1821 Temp src = get_alu_src(ctx, instr->src[0]);
1822 if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
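/* Unsigned variant of the f2i64 lowering above: the 24-bit mantissa (implicit bit included)
 * is placed in the low dword and shifted left by (exponent - 24); exponents below 24 take the
 * 32-bit right-shift path instead, and exponents above 64 saturate the result to all ones. */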
1823 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1824 Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(64u), exponent);
1825 exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
1826 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1827 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1828 Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
1829 Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
1830 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), mantissa, Operand(0u));
1831 Temp new_exponent = bld.tmp(v1);
1832 Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
1833 mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
1834 Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1835 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1836 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
1837 upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
1838 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
1839 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
1840 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1841
1842 } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
1843 if (src.type() == RegType::vgpr)
1844 src = bld.as_uniform(src);
1845 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1846 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1847 exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1848 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1849 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1850 Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
1851 Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
1852 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), mantissa, Operand(0u));
1853 Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
1854 mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
1855 Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
1856 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
1857 Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1858 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1859 Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
1860 lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
1861 upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
1862 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1863
1864 } else if (instr->src[0].src.ssa->bit_size == 64) {
1865 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1866 Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1867 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1868 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1869 Temp floor = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1870 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1871 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1872 Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
1873 if (dst.type() == RegType::sgpr) {
1874 lower = bld.as_uniform(lower);
1875 upper = bld.as_uniform(upper);
1876 }
1877 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1878
1879 } else {
1880 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1881 nir_print_instr(&instr->instr, stderr);
1882 fprintf(stderr, "\n");
1883 }
1884 break;
1885 }
1886 case nir_op_b2f32: {
1887 Temp src = get_alu_src(ctx, instr->src[0]);
1888 if (dst.regClass() == s1) {
1889 src = as_uniform_bool(ctx, src);
1890 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
1891 } else if (dst.regClass() == v1) {
1892 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u),
1893 as_divergent_bool(ctx, src, true));
1894 } else {
1895 unreachable("Wrong destination register class for nir_op_b2f32.");
1896 }
1897 break;
1898 }
1899 case nir_op_b2f64: {
1900 Temp src = get_alu_src(ctx, instr->src[0]);
1901 if (dst.regClass() == s2) {
1902 src = as_uniform_bool(ctx, src);
/* double 1.0 has 0x3FF00000 in its high dword: select that and pack it with a zero low dword */
1903 Temp upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0x3FF00000u), Operand(0u), bld.scc(src));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1904 } else if (dst.regClass() == v2) {
1905 Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1906 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one,
1907 as_divergent_bool(ctx, src, true));
1908 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1909 } else {
1910 unreachable("Wrong destination register class for nir_op_b2f64.");
1911 }
1912 break;
1913 }
1914 case nir_op_i2i32: {
1915 Temp src = get_alu_src(ctx, instr->src[0]);
1916 if (instr->src[0].src.ssa->bit_size == 64) {
1917 /* we can actually just say dst = src, as it would map the lower register */
1918 emit_extract_vector(ctx, src, 0, dst);
1919 } else {
1920 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1921 nir_print_instr(&instr->instr, stderr);
1922 fprintf(stderr, "\n");
1923 }
1924 break;
1925 }
1926 case nir_op_u2u32: {
1927 Temp src = get_alu_src(ctx, instr->src[0]);
1928 if (instr->src[0].src.ssa->bit_size == 16) {
1929 if (dst.regClass() == s1) {
1930 bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
1931 } else {
1932 // TODO: do better with SDWA
1933 bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src);
1934 }
1935 } else if (instr->src[0].src.ssa->bit_size == 64) {
1936 /* we can actually just say dst = src, as it would map the lower register */
1937 emit_extract_vector(ctx, src, 0, dst);
1938 } else {
1939 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1940 nir_print_instr(&instr->instr, stderr);
1941 fprintf(stderr, "\n");
1942 }
1943 break;
1944 }
1945 case nir_op_i2i64: {
1946 Temp src = get_alu_src(ctx, instr->src[0]);
1947 if (instr->src[0].src.ssa->bit_size == 32) {
1948 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
1949 } else {
1950 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1951 nir_print_instr(&instr->instr, stderr);
1952 fprintf(stderr, "\n");
1953 }
1954 break;
1955 }
1956 case nir_op_u2u64: {
1957 Temp src = get_alu_src(ctx, instr->src[0]);
1958 if (instr->src[0].src.ssa->bit_size == 32) {
1959 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
1960 } else {
1961 fprintf(stderr, "Unimplemented NIR instr bit size: ");
1962 nir_print_instr(&instr->instr, stderr);
1963 fprintf(stderr, "\n");
1964 }
1965 break;
1966 }
1967 case nir_op_b2i32: {
1968 Temp src = get_alu_src(ctx, instr->src[0]);
1969 if (dst.regClass() == s1) {
1970 if (src.regClass() == s1) {
1971 bld.copy(Definition(dst), src);
1972 } else {
1973 // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
1974 assert(src.regClass() == s2);
1975 bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(Definition(dst)), Operand(0u), src);
1976 }
1977 } else {
1978 assert(dst.regClass() == v1 && src.regClass() == s2);
1979 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
1980 }
1981 break;
1982 }
1983 case nir_op_i2b1: {
1984 Temp src = get_alu_src(ctx, instr->src[0]);
1985 if (dst.regClass() == s2) {
1986 assert(src.regClass() == v1 || src.regClass() == v2);
1987 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
1988 Definition(dst), Operand(0u), src).def(0).setHint(vcc);
1989 } else {
1990 assert(src.regClass() == s1 && dst.regClass() == s1);
1991 bld.sopc(aco_opcode::s_cmp_lg_u32, bld.scc(Definition(dst)), Operand(0u), src);
1992 }
1993 break;
1994 }
1995 case nir_op_pack_64_2x32_split: {
1996 Temp src0 = get_alu_src(ctx, instr->src[0]);
1997 Temp src1 = get_alu_src(ctx, instr->src[1]);
1998
1999 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2000 break;
2001 }
2002 case nir_op_unpack_64_2x32_split_x:
2003 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2004 break;
2005 case nir_op_unpack_64_2x32_split_y:
2006 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2007 break;
2008 case nir_op_pack_half_2x16: {
2009 Temp src = get_alu_src(ctx, instr->src[0], 2);
2010
2011 if (dst.regClass() == v1) {
2012 Temp src0 = bld.tmp(v1);
2013 Temp src1 = bld.tmp(v1);
2014 bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2015 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2016
2017 } else {
2018 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2019 nir_print_instr(&instr->instr, stderr);
2020 fprintf(stderr, "\n");
2021 }
2022 break;
2023 }
2024 case nir_op_unpack_half_2x16_split_x: {
2025 if (dst.regClass() == v1) {
2026 Builder bld(ctx->program, ctx->block);
2027 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2028 } else {
2029 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2030 nir_print_instr(&instr->instr, stderr);
2031 fprintf(stderr, "\n");
2032 }
2033 break;
2034 }
2035 case nir_op_unpack_half_2x16_split_y: {
2036 if (dst.regClass() == v1) {
2037 Builder bld(ctx->program, ctx->block);
2038 /* TODO: use SDWA here */
2039 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2040 bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2041 } else {
2042 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2043 nir_print_instr(&instr->instr, stderr);
2044 fprintf(stderr, "\n");
2045 }
2046 break;
2047 }
2048 case nir_op_fquantize2f16: {
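/* Round to f16 and back to f32; v_cmp_class with mask 0x36F matches every class except
 * positive/negative denormal, so denormal f16 results are flushed to zero. */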
2049 Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), get_alu_src(ctx, instr->src[0]));
2050
2051 Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */
2052
2053 Temp cmp_res = bld.tmp(s2);
2054 bld.vopc_e64(aco_opcode::v_cmp_class_f16, Definition(cmp_res), f16, mask).def(0).setHint(vcc);
2055
2056 Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2057
2058 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2059 break;
2060 }
2061 case nir_op_bfm: {
2062 Temp bits = get_alu_src(ctx, instr->src[0]);
2063 Temp offset = get_alu_src(ctx, instr->src[1]);
2064
2065 if (dst.regClass() == s1) {
2066 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2067 } else if (dst.regClass() == v1) {
2068 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2069 } else {
2070 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2071 nir_print_instr(&instr->instr, stderr);
2072 fprintf(stderr, "\n");
2073 }
2074 break;
2075 }
2076 case nir_op_bitfield_select: {
2077 /* (mask & insert) | (~mask & base) */
2078 Temp bitmask = get_alu_src(ctx, instr->src[0]);
2079 Temp insert = get_alu_src(ctx, instr->src[1]);
2080 Temp base = get_alu_src(ctx, instr->src[2]);
2081
2082 /* dst = (insert & bitmask) | (base & ~bitmask) */
2083 if (dst.regClass() == s1) {
2084 aco_ptr<Instruction> sop2;
2085 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2086 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2087 Operand lhs;
2088 if (const_insert && const_bitmask) {
2089 lhs = Operand(const_insert->u32 & const_bitmask->u32);
2090 } else {
2091 insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2092 lhs = Operand(insert);
2093 }
2094
2095 Operand rhs;
2096 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2097 if (const_base && const_bitmask) {
2098 rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2099 } else {
2100 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2101 rhs = Operand(base);
2102 }
2103
2104 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2105
2106 } else if (dst.regClass() == v1) {
2107 if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2108 base = as_vgpr(ctx, base);
2109 if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2110 insert = as_vgpr(ctx, insert);
2111
2112 bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2113
2114 } else {
2115 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2116 nir_print_instr(&instr->instr, stderr);
2117 fprintf(stderr, "\n");
2118 }
2119 break;
2120 }
2121 case nir_op_ubfe:
2122 case nir_op_ibfe: {
2123 Temp base = get_alu_src(ctx, instr->src[0]);
2124 Temp offset = get_alu_src(ctx, instr->src[1]);
2125 Temp bits = get_alu_src(ctx, instr->src[2]);
2126
2127 if (dst.type() == RegType::sgpr) {
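/* s_bfe packs the field description into its second source: the offset in the low bits and
 * the field width in bits [22:16] */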
2128 Operand extract;
2129 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2130 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2131 if (const_offset && const_bits) {
2132 uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2133 extract = Operand(const_extract);
2134 } else {
2135 Operand width;
2136 if (const_bits) {
2137 width = Operand(const_bits->u32 << 16);
2138 } else {
2139 width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2140 }
2141 extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2142 }
2143
2144 aco_opcode opcode;
2145 if (dst.regClass() == s1) {
2146 if (instr->op == nir_op_ubfe)
2147 opcode = aco_opcode::s_bfe_u32;
2148 else
2149 opcode = aco_opcode::s_bfe_i32;
2150 } else if (dst.regClass() == s2) {
2151 if (instr->op == nir_op_ubfe)
2152 opcode = aco_opcode::s_bfe_u64;
2153 else
2154 opcode = aco_opcode::s_bfe_i64;
2155 } else {
2156 unreachable("Unsupported BFE bit size");
2157 }
2158
2159 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2160
2161 } else {
2162 aco_opcode opcode;
2163 if (dst.regClass() == v1) {
2164 if (instr->op == nir_op_ubfe)
2165 opcode = aco_opcode::v_bfe_u32;
2166 else
2167 opcode = aco_opcode::v_bfe_i32;
2168 } else {
2169 unreachable("Unsupported BFE bit size");
2170 }
2171
2172 emit_vop3a_instruction(ctx, instr, opcode, dst);
2173 }
2174 break;
2175 }
2176 case nir_op_bit_count: {
2177 Temp src = get_alu_src(ctx, instr->src[0]);
2178 if (src.regClass() == s1) {
2179 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2180 } else if (src.regClass() == v1) {
2181 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2182 } else if (src.regClass() == v2) {
2183 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2184 emit_extract_vector(ctx, src, 1, v1),
2185 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2186 emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2187 } else if (src.regClass() == s2) {
2188 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2189 } else {
2190 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2191 nir_print_instr(&instr->instr, stderr);
2192 fprintf(stderr, "\n");
2193 }
2194 break;
2195 }
2196 case nir_op_flt: {
2197 if (instr->src[0].src.ssa->bit_size == 32)
2198 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f32, dst);
2199 else if (instr->src[0].src.ssa->bit_size == 64)
2200 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f64, dst);
2201 break;
2202 }
2203 case nir_op_fge: {
2204 if (instr->src[0].src.ssa->bit_size == 32)
2205 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f32, dst);
2206 else if (instr->src[0].src.ssa->bit_size == 64)
2207 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f64, dst);
2208 break;
2209 }
2210 case nir_op_feq: {
2211 if (instr->src[0].src.ssa->bit_size == 32)
2212 emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f32, dst);
2213 else if (instr->src[0].src.ssa->bit_size == 64)
2214 emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f64, dst);
2215 break;
2216 }
2217 case nir_op_fne: {
2218 if (instr->src[0].src.ssa->bit_size == 32)
2219 emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f32, dst);
2220 else if (instr->src[0].src.ssa->bit_size == 64)
2221 emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f64, dst);
2222 break;
2223 }
2224 case nir_op_ilt: {
2225 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2226 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i32, dst);
2227 else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2228 emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_i32, dst);
2229 else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2230 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i64, dst);
2231 break;
2232 }
2233 case nir_op_ige: {
2234 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2235 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i32, dst);
2236 else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2237 emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_i32, dst);
2238 else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2239 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i64, dst);
2240 break;
2241 }
2242 case nir_op_ieq: {
2243 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
2244 emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i32, dst);
2245 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
2246 emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_i32, dst);
2247 } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
2248 emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i64, dst);
2249 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
2250 emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_u64, dst);
2251 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
2252 Temp src0 = get_alu_src(ctx, instr->src[0]);
2253 Temp src1 = get_alu_src(ctx, instr->src[1]);
2254 bld.sopc(aco_opcode::s_cmp_eq_i32, bld.scc(Definition(dst)),
2255 as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
2256 } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
2257 Temp src0 = get_alu_src(ctx, instr->src[0]);
2258 Temp src1 = get_alu_src(ctx, instr->src[1]);
2259 bld.sop2(aco_opcode::s_xnor_b64, Definition(dst), bld.def(s1, scc),
2260 as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
2261 } else {
2262 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2263 nir_print_instr(&instr->instr, stderr);
2264 fprintf(stderr, "\n");
2265 }
2266 break;
2267 }
2268 case nir_op_ine: {
2269 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
2270 emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i32, dst);
2271 } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
2272 emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i64, dst);
2273 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
2274 emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_i32, dst);
2275 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
2276 emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_u64, dst);
2277 } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
2278 Temp src0 = get_alu_src(ctx, instr->src[0]);
2279 Temp src1 = get_alu_src(ctx, instr->src[1]);
2280 bld.sopc(aco_opcode::s_cmp_lg_i32, bld.scc(Definition(dst)),
2281 as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
2282 } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
2283 Temp src0 = get_alu_src(ctx, instr->src[0]);
2284 Temp src1 = get_alu_src(ctx, instr->src[1]);
2285 bld.sop2(aco_opcode::s_xor_b64, Definition(dst), bld.def(s1, scc),
2286 as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
2287 } else {
2288 fprintf(stderr, "Unimplemented NIR instr bit size: ");
2289 nir_print_instr(&instr->instr, stderr);
2290 fprintf(stderr, "\n");
2291 }
2292 break;
2293 }
2294 case nir_op_ult: {
2295 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2296 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u32, dst);
2297 else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2298 emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_u32, dst);
2299 else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2300 emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u64, dst);
2301 break;
2302 }
2303 case nir_op_uge: {
2304 if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2305 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u32, dst);
2306 else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2307 emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_u32, dst);
2308 else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2309 emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u64, dst);
2310 break;
2311 }
2312 case nir_op_fddx:
2313 case nir_op_fddy:
2314 case nir_op_fddx_fine:
2315 case nir_op_fddy_fine:
2316 case nir_op_fddx_coarse:
2317 case nir_op_fddy_coarse: {
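/* Derivatives are computed with DPP quad permutes: tl is broadcast from the reference lane of
 * each 2x2 quad, and the per-lane difference against it is the derivative. The result goes
 * through emit_wqm since the quad swizzle reads neighbouring lanes. */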
2318 Definition tl = bld.def(v1);
2319 uint16_t dpp_ctrl;
2320 if (instr->op == nir_op_fddx_fine) {
2321 bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 2, 2));
2322 dpp_ctrl = dpp_quad_perm(1, 1, 3, 3);
2323 } else if (instr->op == nir_op_fddy_fine) {
2324 bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 1, 0, 1));
2325 dpp_ctrl = dpp_quad_perm(2, 3, 2, 3);
2326 } else {
2327 bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 0, 0));
2328 if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
2329 dpp_ctrl = dpp_quad_perm(1, 1, 1, 1);
2330 else
2331 dpp_ctrl = dpp_quad_perm(2, 2, 2, 2);
2332 }
2333
2334 Definition tmp = bld.def(v1);
2335 bld.vop2_dpp(aco_opcode::v_sub_f32, tmp, get_alu_src(ctx, instr->src[0]), tl.getTemp(), dpp_ctrl);
2336 emit_wqm(ctx, tmp.getTemp(), dst, true);
2337 break;
2338 }
2339 default:
2340 fprintf(stderr, "Unknown NIR ALU instr: ");
2341 nir_print_instr(&instr->instr, stderr);
2342 fprintf(stderr, "\n");
2343 }
2344 }
2345
2346 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
2347 {
2348 Temp dst = get_ssa_temp(ctx, &instr->def);
2349
2350 // TODO: we really want to have the resulting type here, as this would allow for 64-bit literals
2351 // which would be truncated, dropping the LSBs for doubles and the MSBs for ints
2352 // for now, we only use s_mov_b64 with 64-bit inline constants
2353 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
2354 assert(dst.type() == RegType::sgpr);
2355
2356 if (dst.size() == 1)
2357 {
2358 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(instr->value[0].u32));
2359 } else {
2360 assert(dst.size() != 1);
2361 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2362 if (instr->def.bit_size == 64)
2363 for (unsigned i = 0; i < dst.size(); i++)
2364 vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
2365 else {
2366 for (unsigned i = 0; i < dst.size(); i++)
2367 vec->operands[i] = Operand{instr->value[i].u32};
2368 }
2369 vec->definitions[0] = Definition(dst);
2370 ctx->block->instructions.emplace_back(std::move(vec));
2371 }
2372 }
2373
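/* Replicates each bit of a write mask 'multiplier' times, e.g. widen_mask(0b0101, 2) == 0b00110011.
 * Used below to turn a per-64-bit-component mask into a per-dword mask. */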
2374 uint32_t widen_mask(uint32_t mask, unsigned multiplier)
2375 {
2376 uint32_t new_mask = 0;
2377 for(unsigned i = 0; i < 32 && (1u << i) <= mask; ++i)
2378 if (mask & (1u << i))
2379 new_mask |= ((1u << multiplier) - 1u) << (i * multiplier);
2380 return new_mask;
2381 }
2382
2383 void visit_store_vs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2384 {
2385 /* This wouldn't work inside control flow or with indirect offsets but
2386 * that doesn't happen because of nir_lower_io_to_temporaries(). */
2387
2388 unsigned write_mask = nir_intrinsic_write_mask(instr);
2389 unsigned component = nir_intrinsic_component(instr);
2390 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2391 unsigned idx = nir_intrinsic_base(instr) + component;
2392
2393 nir_instr *off_instr = instr->src[1].ssa->parent_instr;
2394 if (off_instr->type != nir_instr_type_load_const) {
2395 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2396 nir_print_instr(off_instr, stderr);
2397 fprintf(stderr, "\n");
2398 }
2399 idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
2400
2401 if (instr->src[0].ssa->bit_size == 64)
2402 write_mask = widen_mask(write_mask, 2);
2403
2404 for (unsigned i = 0; i < 8; ++i) {
2405 if (write_mask & (1 << i)) {
2406 ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u);
2407 ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
2408 }
2409 idx++;
2410 }
2411 }
2412
2413 void visit_store_fs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2414 {
2415 unsigned write_mask = nir_intrinsic_write_mask(instr);
2416 Operand values[4];
2417 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2418 for (unsigned i = 0; i < 4; ++i) {
2419 if (write_mask & (1 << i)) {
2420 Temp tmp = emit_extract_vector(ctx, src, i, v1);
2421 values[i] = Operand(tmp);
2422 } else {
2423 values[i] = Operand(v1);
2424 }
2425 }
2426
2427 unsigned index = nir_intrinsic_base(instr) / 4;
2428 unsigned target, col_format;
2429 unsigned enabled_channels = 0xF;
2430 aco_opcode compr_op = (aco_opcode)0;
2431
2432 nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2433 assert(offset && "Non-const offsets on exports not yet supported");
2434 index += offset->u32;
2435
2436 assert(index != FRAG_RESULT_COLOR);
2437
2438 /* Unlike vertex shader exports, it's fine to use multiple exports to
2439 * export separate channels of one target. So shaders which export both
2440 * FRAG_RESULT_SAMPLE_MASK and FRAG_RESULT_DEPTH should work fine.
2441 * TODO: combine the exports in those cases and create better code
2442 */
2443
2444 if (index == FRAG_RESULT_SAMPLE_MASK) {
2445
2446 if (ctx->program->info->ps.writes_z) {
2447 target = V_008DFC_SQ_EXP_MRTZ;
2448 enabled_channels = 0x4;
2449 col_format = (unsigned) -1;
2450
2451 values[2] = values[0];
2452 values[0] = Operand(v1);
2453 } else {
2454 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2455 exp->valid_mask = false;
2456 exp->done = false;
2457 exp->compressed = true;
2458 exp->dest = V_008DFC_SQ_EXP_MRTZ;
2459 exp->enabled_mask = 0xc;
2460 for (int i = 0; i < 4; i++)
2461 exp->operands[i] = Operand(v1);
2462 exp->operands[1] = Operand(values[0]);
2463 ctx->block->instructions.emplace_back(std::move(exp));
2464 return;
2465 }
2466
2467 } else if (index == FRAG_RESULT_DEPTH) {
2468
2469 target = V_008DFC_SQ_EXP_MRTZ;
2470 enabled_channels = 0x1;
2471 col_format = (unsigned) -1;
2472
2473 } else if (index == FRAG_RESULT_STENCIL) {
2474
2475 if (ctx->program->info->ps.writes_z) {
2476 target = V_008DFC_SQ_EXP_MRTZ;
2477 enabled_channels = 0x2;
2478 col_format = (unsigned) -1;
2479
2480 values[1] = values[0];
2481 values[0] = Operand(v1);
2482 } else {
2483 aco_ptr<Instruction> shift{create_instruction<VOP2_instruction>(aco_opcode::v_lshlrev_b32, Format::VOP2, 2, 1)};
2484 shift->operands[0] = Operand((uint32_t) 16);
2485 shift->operands[1] = values[0];
2486 Temp tmp = {ctx->program->allocateId(), v1};
2487 shift->definitions[0] = Definition(tmp);
2488 ctx->block->instructions.emplace_back(std::move(shift));
2489
2490 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2491 exp->valid_mask = false;
2492 exp->done = false;
2493 exp->compressed = true;
2494 exp->dest = V_008DFC_SQ_EXP_MRTZ;
2495 exp->enabled_mask = 0x3;
2496 exp->operands[0] = Operand(tmp);
2497 for (int i = 1; i < 4; i++)
2498 exp->operands[i] = Operand(v1);
2499 ctx->block->instructions.emplace_back(std::move(exp));
2500 return;
2501 }
2502
2503 } else {
2504 index -= FRAG_RESULT_DATA0;
2505 target = V_008DFC_SQ_EXP_MRT + index;
2506 col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
2507 }
2508 ASSERTED bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
2509 ASSERTED bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1;
2510 assert(!is_int8 && !is_int10);
2511
2512 switch (col_format)
2513 {
2514 case V_028714_SPI_SHADER_ZERO:
2515 enabled_channels = 0; /* writemask */
2516 target = V_008DFC_SQ_EXP_NULL;
2517 break;
2518
2519 case V_028714_SPI_SHADER_32_R:
2520 enabled_channels = 1;
2521 break;
2522
2523 case V_028714_SPI_SHADER_32_GR:
2524 enabled_channels = 0x3;
2525 break;
2526
2527 case V_028714_SPI_SHADER_32_AR:
2528 if (ctx->options->chip_class >= GFX10) {
2529 /* Special case: on GFX10, the outputs are different for 32_AR */
2530 enabled_channels = 0x3;
2531 values[1] = values[3];
2532 } else {
2533 enabled_channels = 0x9;
2534 }
2535 break;
2536
2537 case V_028714_SPI_SHADER_FP16_ABGR:
2538 enabled_channels = 0x5;
2539 compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
2540 break;
2541
2542 case V_028714_SPI_SHADER_UNORM16_ABGR:
2543 enabled_channels = 0x5;
2544 compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
2545 break;
2546
2547 case V_028714_SPI_SHADER_SNORM16_ABGR:
2548 enabled_channels = 0x5;
2549 compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
2550 break;
2551
2552 case V_028714_SPI_SHADER_UINT16_ABGR:
2553 enabled_channels = 0x5;
2554 compr_op = aco_opcode::v_cvt_pk_u16_u32;
2555 break;
2556
2557 case V_028714_SPI_SHADER_SINT16_ABGR:
2558 enabled_channels = 0x5;
2559 compr_op = aco_opcode::v_cvt_pk_i16_i32;
2560 break;
2561
2562 case V_028714_SPI_SHADER_32_ABGR:
2563 enabled_channels = 0xF;
2564 break;
2565
2566 default:
2567 break;
2568 }
2569
2570 if (target == V_008DFC_SQ_EXP_NULL)
2571 return;
2572
2573 if ((bool)compr_op)
2574 {
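/* compressed exports pack two 16-bit channels per dword, so pairs of source values are
 * combined with the packing conversion chosen above */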
2575 for (int i = 0; i < 2; i++)
2576 {
2577 /* check if at least one of the values to be compressed is enabled */
2578 unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
2579 if (enabled) {
2580 enabled_channels |= enabled << (i*2);
2581 aco_ptr<VOP3A_instruction> compr{create_instruction<VOP3A_instruction>(compr_op, Format::VOP3A, 2, 1)};
2582 Temp tmp{ctx->program->allocateId(), v1};
2583 compr->operands[0] = values[i*2].isUndefined() ? Operand(0u) : values[i*2];
2584 compr->operands[1] = values[i*2+1].isUndefined() ? Operand(0u) : values[i*2+1];
2585 compr->definitions[0] = Definition(tmp);
2586 values[i] = Operand(tmp);
2587 ctx->block->instructions.emplace_back(std::move(compr));
2588 } else {
2589 values[i] = Operand(v1);
2590 }
2591 }
2592 }
2593
2594 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2595 exp->valid_mask = false;
2596 exp->done = false;
2597 exp->compressed = (bool) compr_op;
2598 exp->dest = target;
2599 exp->enabled_mask = enabled_channels;
2600 if ((bool) compr_op) {
2601 for (int i = 0; i < 2; i++)
2602 exp->operands[i] = enabled_channels & (3 << (i * 2)) ? values[i] : Operand(v1);
2603 exp->operands[2] = Operand(v1);
2604 exp->operands[3] = Operand(v1);
2605 } else {
2606 for (int i = 0; i < 4; i++)
2607 exp->operands[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
2608 }
2609
2610 ctx->block->instructions.emplace_back(std::move(exp));
2611 }
2612
2613 Operand load_lds_size_m0(isel_context *ctx)
2614 {
2615 /* TODO: m0 does not need to be initialized on GFX9+ */
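/* DS address calculations are bounded by m0 on older hardware, so use the maximum range */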
2616 Builder bld(ctx->program, ctx->block);
2617 return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
2618 }
2619
2620 void load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
2621 Temp address, unsigned base_offset, unsigned align)
2622 {
2623 assert(util_is_power_of_two_nonzero(align) && align >= 4);
2624
2625 Builder bld(ctx->program, ctx->block);
2626
2627 Operand m = load_lds_size_m0(ctx);
2628
2629 unsigned num_components = dst.size() * 4u / elem_size_bytes;
2630 unsigned bytes_read = 0;
2631 unsigned result_size = 0;
2632 unsigned total_bytes = num_components * elem_size_bytes;
2633 std::array<Temp, 4> result;
2634
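/* Split the load into the widest DS instructions the remaining size and alignment allow:
 * 16-byte chunks use ds_read_b128 when 16-byte aligned or ds_read2_b64 when only 8-byte
 * aligned, 8-byte chunks use ds_read_b64 or ds_read2_b32 when only dword aligned, and
 * anything smaller falls back to ds_read_b32. */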
2635 while (bytes_read < total_bytes) {
2636 unsigned todo = total_bytes - bytes_read;
2637 bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0;
2638 bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0;
2639
2640 aco_opcode op = aco_opcode::last_opcode;
2641 bool read2 = false;
2642 if (todo >= 16 && aligned16) {
2643 op = aco_opcode::ds_read_b128;
2644 todo = 16;
2645 } else if (todo >= 16 && aligned8) {
2646 op = aco_opcode::ds_read2_b64;
2647 read2 = true;
2648 todo = 16;
2649 } else if (todo >= 12 && aligned16) {
2650 op = aco_opcode::ds_read_b96;
2651 todo = 12;
2652 } else if (todo >= 8 && aligned8) {
2653 op = aco_opcode::ds_read_b64;
2654 todo = 8;
2655 } else if (todo >= 8) {
2656 op = aco_opcode::ds_read2_b32;
2657 read2 = true;
2658 todo = 8;
2659 } else if (todo >= 4) {
2660 op = aco_opcode::ds_read_b32;
2661 todo = 4;
2662 } else {
2663 assert(false);
2664 }
2665 assert(todo % elem_size_bytes == 0);
2666 unsigned num_elements = todo / elem_size_bytes;
2667 unsigned offset = base_offset + bytes_read;
2668 unsigned max_offset = read2 ? 1019 : 65535;
2669
2670 Temp address_offset = address;
2671 if (offset > max_offset) {
2672 address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
2673 offset = bytes_read;
2674 }
2675 assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */
2676
2677 Temp res;
2678 if (num_components == 1 && dst.type() == RegType::vgpr)
2679 res = dst;
2680 else
2681 res = bld.tmp(RegClass(RegType::vgpr, todo / 4));
2682
2683 if (read2)
2684 res = bld.ds(op, Definition(res), address_offset, m, offset >> 2, (offset >> 2) + 1);
2685 else
2686 res = bld.ds(op, Definition(res), address_offset, m, offset);
2687
2688 if (num_components == 1) {
2689 assert(todo == total_bytes);
2690 if (dst.type() == RegType::sgpr)
2691 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
2692 return;
2693 }
2694
2695 if (dst.type() == RegType::sgpr)
2696 res = bld.as_uniform(res);
2697
2698 if (num_elements == 1) {
2699 result[result_size++] = res;
2700 } else {
2701 assert(res != dst && res.size() % num_elements == 0);
2702 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)};
2703 split->operands[0] = Operand(res);
2704 for (unsigned i = 0; i < num_elements; i++)
2705 split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4));
2706 ctx->block->instructions.emplace_back(std::move(split));
2707 }
2708
2709 bytes_read += todo;
2710 }
2711
2712 assert(result_size == num_components && result_size > 1);
2713 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)};
2714 for (unsigned i = 0; i < result_size; i++)
2715 vec->operands[i] = Operand(result[i]);
2716 vec->definitions[0] = Definition(dst);
2717 ctx->block->instructions.emplace_back(std::move(vec));
2718 ctx->allocated_vec.emplace(dst.id(), result);
2719 }
2720
2721 Temp extract_subvector(isel_context *ctx, Temp data, unsigned start, unsigned size, RegType type)
2722 {
2723 if (start == 0 && size == data.size())
2724 return type == RegType::vgpr ? as_vgpr(ctx, data) : data;
2725
2726 unsigned size_hint = 1;
2727 auto it = ctx->allocated_vec.find(data.id());
2728 if (it != ctx->allocated_vec.end())
2729 size_hint = it->second[0].size();
2730 if (size % size_hint || start % size_hint)
2731 size_hint = 1;
2732
2733 start /= size_hint;
2734 size /= size_hint;
2735
2736 Temp elems[size];
2737 for (unsigned i = 0; i < size; i++)
2738 elems[i] = emit_extract_vector(ctx, data, start + i, RegClass(type, size_hint));
2739
2740 if (size == 1)
2741 return type == RegType::vgpr ? as_vgpr(ctx, elems[0]) : elems[0];
2742
2743 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
2744 for (unsigned i = 0; i < size; i++)
2745 vec->operands[i] = Operand(elems[i]);
2746 Temp res = {ctx->program->allocateId(), RegClass(type, size * size_hint)};
2747 vec->definitions[0] = Definition(res);
2748 ctx->block->instructions.emplace_back(std::move(vec));
2749 return res;
2750 }
2751
2752 void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned data_start, unsigned total_size, unsigned offset0, unsigned offset1, unsigned align)
2753 {
2754 Builder bld(ctx->program, ctx->block);
2755 unsigned bytes_written = 0;
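/* Mirrors the load path above: pick the widest DS write the remaining size and alignment
 * allow, using the two-address ds_write2_b64/ds_write2_b32 forms for chunks that are only
 * 8- or 4-byte aligned. */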
2756 while (bytes_written < total_size * 4) {
2757 unsigned todo = total_size * 4 - bytes_written;
2758 bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
2759 bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
2760
2761 aco_opcode op = aco_opcode::last_opcode;
2762 bool write2 = false;
2763 unsigned size = 0;
2764 if (todo >= 16 && aligned16) {
2765 op = aco_opcode::ds_write_b128;
2766 size = 4;
2767 } else if (todo >= 16 && aligned8) {
2768 op = aco_opcode::ds_write2_b64;
2769 write2 = true;
2770 size = 4;
2771 } else if (todo >= 12 && aligned16) {
2772 op = aco_opcode::ds_write_b96;
2773 size = 3;
2774 } else if (todo >= 8 && aligned8) {
2775 op = aco_opcode::ds_write_b64;
2776 size = 2;
2777 } else if (todo >= 8) {
2778 op = aco_opcode::ds_write2_b32;
2779 write2 = true;
2780 size = 2;
2781 } else if (todo >= 4) {
2782 op = aco_opcode::ds_write_b32;
2783 size = 1;
2784 } else {
2785 assert(false);
2786 }
2787
2788 unsigned offset = offset0 + offset1 + bytes_written;
2789 unsigned max_offset = write2 ? 1020 : 65535;
2790 Temp address_offset = address;
2791 if (offset > max_offset) {
2792 address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
2793 offset = offset1 + bytes_written;
2794 }
2795 assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
2796
2797 if (write2) {
2798 Temp val0 = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size / 2, RegType::vgpr);
2799 Temp val1 = extract_subvector(ctx, data, data_start + (bytes_written >> 2) + 1, size / 2, RegType::vgpr);
2800 bld.ds(op, address_offset, val0, val1, m, offset >> 2, (offset >> 2) + 1);
2801 } else {
2802 Temp val = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size, RegType::vgpr);
2803 bld.ds(op, address_offset, val, m, offset);
2804 }
2805
2806 bytes_written += size * 4;
2807 }
2808 }
2809
2810 void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
2811 Temp address, unsigned base_offset, unsigned align)
2812 {
2813 assert(util_is_power_of_two_nonzero(align) && align >= 4);
2814
2815 Operand m = load_lds_size_m0(ctx);
2816
2817 /* we need at most two stores for 32bit variables */
2818 int start[2], count[2];
2819 u_bit_scan_consecutive_range(&wrmask, &start[0], &count[0]);
2820 u_bit_scan_consecutive_range(&wrmask, &start[1], &count[1]);
2821 assert(wrmask == 0);
2822
2823 /* one combined store is sufficient */
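/* the assert below relies on each range writing exactly one element; the two values are
 * emitted as a single ds_write2_b32/ds_write2_b64 with offsets in units of the element size */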
2824 if (count[0] == count[1]) {
2825 Builder bld(ctx->program, ctx->block);
2826
2827 Temp address_offset = address;
2828 if ((base_offset >> 2) + start[1] > 255) {
2829 address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
2830 base_offset = 0;
2831 }
2832
2833 assert(count[0] == 1);
2834 Temp val0 = emit_extract_vector(ctx, data, start[0], v1);
2835 Temp val1 = emit_extract_vector(ctx, data, start[1], v1);
2836 aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
2837 base_offset = base_offset / elem_size_bytes;
2838 bld.ds(op, address_offset, val0, val1, m,
2839 base_offset + start[0], base_offset + start[1]);
2840 return;
2841 }
2842
2843 for (unsigned i = 0; i < 2; i++) {
2844 if (count[i] == 0)
2845 continue;
2846
2847 unsigned elem_size_words = elem_size_bytes / 4;
2848 ds_write_helper(ctx, m, address, data, start[i] * elem_size_words, count[i] * elem_size_words,
2849 base_offset, start[i] * elem_size_bytes, align);
2850 }
2851 return;
2852 }
2853
2854 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
2855 {
2856 if (ctx->stage == vertex_vs) {
2857 visit_store_vs_output(ctx, instr);
2858 } else if (ctx->stage == fragment_fs) {
2859 visit_store_fs_output(ctx, instr);
2860 } else {
2861 unreachable("Shader stage not implemented");
2862 }
2863 }
2864
2865 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
2866 {
2867 Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
2868 Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
2869
2870 Builder bld(ctx->program, ctx->block);
2871 Temp tmp = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
2872 bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), tmp, idx, component);
2873 }
2874
2875 void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
2876 {
2877 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
2878 for (unsigned i = 0; i < num_components; i++)
2879 vec->operands[i] = Operand(ctx->fs_inputs[fs_input::frag_pos_0 + i]);
2880
2881 if (ctx->fs_vgpr_args[fs_input::frag_pos_3]) {
2882 assert(num_components == 4);
2883 Builder bld(ctx->program, ctx->block);
2884 vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ctx->fs_inputs[fs_input::frag_pos_3]);
2885 }
2886
2887 for (Operand& op : vec->operands)
2888 op = op.isUndefined() ? Operand(0u) : op;
2889
2890 vec->definitions[0] = Definition(dst);
2891 ctx->block->instructions.emplace_back(std::move(vec));
2892 emit_split_vector(ctx, dst, num_components);
2893 return;
2894 }
2895
2896 void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
2897 {
2898 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2899 Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
2900 unsigned idx = nir_intrinsic_base(instr);
2901 unsigned component = nir_intrinsic_component(instr);
2902 Temp prim_mask = ctx->prim_mask;
2903
2904 nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2905 if (offset) {
2906 assert(offset->u32 == 0);
2907 } else {
2908 /* the lower 15 bits of the prim_mask contain the offset into LDS
2909 * while the upper bits contain the number of prims */
2910 Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
2911 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
2912 Builder bld(ctx->program, ctx->block);
2913 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
2914 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
2915 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
2916 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
2917 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
2918 }
2919
2920 if (instr->dest.ssa.num_components == 1) {
2921 emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
2922 } else {
2923 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
2924 for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
2925 {
2926 Temp tmp = {ctx->program->allocateId(), v1};
2927 emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
2928 vec->operands[i] = Operand(tmp);
2929 }
2930 vec->definitions[0] = Definition(dst);
2931 ctx->block->instructions.emplace_back(std::move(vec));
2932 }
2933 }
2934
2935 unsigned get_num_channels_from_data_format(unsigned data_format)
2936 {
2937 switch (data_format) {
2938 case V_008F0C_BUF_DATA_FORMAT_8:
2939 case V_008F0C_BUF_DATA_FORMAT_16:
2940 case V_008F0C_BUF_DATA_FORMAT_32:
2941 return 1;
2942 case V_008F0C_BUF_DATA_FORMAT_8_8:
2943 case V_008F0C_BUF_DATA_FORMAT_16_16:
2944 case V_008F0C_BUF_DATA_FORMAT_32_32:
2945 return 2;
2946 case V_008F0C_BUF_DATA_FORMAT_10_11_11:
2947 case V_008F0C_BUF_DATA_FORMAT_11_11_10:
2948 case V_008F0C_BUF_DATA_FORMAT_32_32_32:
2949 return 3;
2950 case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
2951 case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
2952 case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
2953 case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
2954 case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
2955 return 4;
2956 default:
2957 break;
2958 }
2959
2960 return 4;
2961 }
2962
2963 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
2964 * so we may need to fix it up. */
2965 Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
2966 {
2967 Builder bld(ctx->program, ctx->block);
2968
2969 if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
2970 alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
2971
2972 /* For the integer-like cases, do a natural sign extension.
2973 *
2974 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
2975 * and happen to contain 0, 1, 2, 3 as the two LSBs of the
2976 * exponent.
2977 */
2978 alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
2979 alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
2980
2981 /* Convert back to the right type. */
2982 if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
2983 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
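/* clamp to -1.0: the sign-extended 2-bit value -2 has no SNORM representation */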
2984 Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0xbf800000u), alpha);
2985 alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
2986 } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
2987 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
2988 }
2989
2990 return alpha;
2991 }
2992
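/* load_input: for vertex shaders this is a vertex fetch with a typed buffer
 * load (MTBUF), using the attribute binding/offset/stride/format from the
 * pipeline key; for fragment shaders it reads a flat input via
 * v_interp_mov_f32. */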
2993 void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
2994 {
2995 Builder bld(ctx->program, ctx->block);
2996 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2997 if (ctx->stage & sw_vs) {
2998
2999 nir_instr *off_instr = instr->src[0].ssa->parent_instr;
3000 if (off_instr->type != nir_instr_type_load_const) {
3001 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
3002 nir_print_instr(off_instr, stderr);
3003 fprintf(stderr, "\n");
3004 }
3005 uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
3006
3007 Temp vertex_buffers = convert_pointer_to_64_bit(ctx, ctx->vertex_buffers);
3008
3009 unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
3010 unsigned component = nir_intrinsic_component(instr);
3011 unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
3012 uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
3013 uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
3014 unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
3015
3016 unsigned dfmt = attrib_format & 0xf;
3017
3018 unsigned nfmt = (attrib_format >> 4) & 0x7;
3019 unsigned num_dfmt_channels = get_num_channels_from_data_format(dfmt);
3020 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
3021 unsigned num_channels = MIN2(util_last_bit(mask), num_dfmt_channels);
3022 unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
3023 bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
3024 if (post_shuffle)
3025 num_channels = MAX2(num_channels, 3);
3026
3027 Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, Operand(attrib_binding * 16u));
3028
3029 Temp index;
3030 if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
3031 uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
3032 if (divisor) {
3033 ctx->needs_instance_id = true;
3034
3035 if (divisor != 1) {
3036 Temp divided = bld.tmp(v1);
3037 emit_v_div_u32(ctx, divided, as_vgpr(ctx, ctx->instance_id), divisor);
3038 index = bld.vadd32(bld.def(v1), ctx->start_instance, divided);
3039 } else {
3040 index = bld.vadd32(bld.def(v1), ctx->start_instance, ctx->instance_id);
3041 }
3042 } else {
3043 index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), ctx->start_instance);
3044 }
3045 } else {
3046 index = bld.vadd32(bld.def(v1), ctx->base_vertex, ctx->vertex_id);
3047 }
3048
3049 if (attrib_stride != 0 && attrib_offset > attrib_stride) {
3050 index = bld.vadd32(bld.def(v1), Operand(attrib_offset / attrib_stride), index);
3051 attrib_offset = attrib_offset % attrib_stride;
3052 }
3053
3054 Operand soffset(0u);
3055 if (attrib_offset >= 4096) {
3056 soffset = bld.copy(bld.def(s1), Operand(attrib_offset));
3057 attrib_offset = 0;
3058 }
3059
3060 aco_opcode opcode;
3061 switch (num_channels) {
3062 case 1:
3063 opcode = aco_opcode::tbuffer_load_format_x;
3064 break;
3065 case 2:
3066 opcode = aco_opcode::tbuffer_load_format_xy;
3067 break;
3068 case 3:
3069 opcode = aco_opcode::tbuffer_load_format_xyz;
3070 break;
3071 case 4:
3072 opcode = aco_opcode::tbuffer_load_format_xyzw;
3073 break;
3074 default:
3075 unreachable("Unimplemented load_input vector size");
3076 }
3077
3078 Temp tmp = post_shuffle || num_channels != dst.size() || alpha_adjust != RADV_ALPHA_ADJUST_NONE || component ? bld.tmp(RegType::vgpr, num_channels) : dst;
3079
3080 aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>(opcode, Format::MTBUF, 3, 1)};
3081 mtbuf->operands[0] = Operand(index);
3082 mtbuf->operands[1] = Operand(list);
3083 mtbuf->operands[2] = soffset;
3084 mtbuf->definitions[0] = Definition(tmp);
3085 mtbuf->idxen = true;
3086 mtbuf->can_reorder = true;
3087 mtbuf->dfmt = dfmt;
3088 mtbuf->nfmt = nfmt;
3089 assert(attrib_offset < 4096);
3090 mtbuf->offset = attrib_offset;
3091 ctx->block->instructions.emplace_back(std::move(mtbuf));
3092
3093 emit_split_vector(ctx, tmp, tmp.size());
3094
3095 if (tmp.id() != dst.id()) {
3096 bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
3097 nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
3098
3099 static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
3100 static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
3101 const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
3102
3103 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3104 for (unsigned i = 0; i < dst.size(); i++) {
3105 unsigned idx = i + component;
3106 if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE && num_channels >= 4) {
3107 Temp alpha = emit_extract_vector(ctx, tmp, swizzle[3], v1);
3108 vec->operands[3] = Operand(adjust_vertex_fetch_alpha(ctx, alpha_adjust, alpha));
3109 } else if (idx < num_channels) {
3110 vec->operands[i] = Operand(emit_extract_vector(ctx, tmp, swizzle[idx], v1));
3111 } else if (is_float && idx == 3) {
3112 vec->operands[i] = Operand(0x3f800000u);
3113 } else if (!is_float && idx == 3) {
3114 vec->operands[i] = Operand(1u);
3115 } else {
3116 vec->operands[i] = Operand(0u);
3117 }
3118 }
3119 vec->definitions[0] = Definition(dst);
3120 ctx->block->instructions.emplace_back(std::move(vec));
3121 emit_split_vector(ctx, dst, dst.size());
3122 }
3123
3124 } else if (ctx->stage == fragment_fs) {
3125 nir_instr *off_instr = instr->src[0].ssa->parent_instr;
3126 if (off_instr->type != nir_instr_type_load_const ||
3127 nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
3128 fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
3129 nir_print_instr(off_instr, stderr);
3130 fprintf(stderr, "\n");
3131 }
3132
3133 Temp prim_mask = ctx->prim_mask;
3134 nir_const_value* offset = nir_src_as_const_value(instr->src[0]);
3135 if (offset) {
3136 assert(offset->u32 == 0);
3137 } else {
3138 /* the lower 15 bits of the prim_mask contain the offset into LDS
3139 * while the upper bits contain the number of prims */
3140 Temp offset_src = get_ssa_temp(ctx, instr->src[0].ssa);
3141 assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
3142 Builder bld(ctx->program, ctx->block);
3143 Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
3144 stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
3145 stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
3146 offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
3147 prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
3148 }
3149
3150 unsigned idx = nir_intrinsic_base(instr);
3151 unsigned component = nir_intrinsic_component(instr);
3152
3153 if (dst.size() == 1) {
3154 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(2u), bld.m0(prim_mask), idx, component);
3155 } else {
3156 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
3157 for (unsigned i = 0; i < dst.size(); i++)
3158 vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(2u), bld.m0(prim_mask), idx, component + i);
3159 vec->definitions[0] = Definition(dst);
3160 bld.insert(std::move(vec));
3161 }
3162
3163 } else {
3164 unreachable("Shader stage not implemented");
3165 }
3166 }
3167
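/* Return a pointer to the given descriptor set, either directly from the user
 * SGPRs or loaded from the indirect descriptor buffer. */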
3168 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
3169 {
3170 if (ctx->program->info->need_indirect_descriptor_sets) {
3171 Builder bld(ctx->program, ctx->block);
3172 Temp ptr64 = convert_pointer_to_64_bit(ctx, ctx->descriptor_sets[0]);
3173 return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, Operand(desc_set << 2));//, false, false, false);
3174 }
3175
3176 return ctx->descriptor_sets[desc_set];
3177 }
3178
3179
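/* vulkan_resource_index: compute the descriptor address as
 * set pointer + binding offset + array index * binding stride. Dynamic
 * uniform/storage buffers are addressed relative to the push constant area. */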
3180 void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
3181 {
3182 Builder bld(ctx->program, ctx->block);
3183 Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
3184 if (!ctx->divergent_vals[instr->dest.ssa.index])
3185 index = bld.as_uniform(index);
3186 unsigned desc_set = nir_intrinsic_desc_set(instr);
3187 unsigned binding = nir_intrinsic_binding(instr);
3188
3189 Temp desc_ptr;
3190 radv_pipeline_layout *pipeline_layout = ctx->options->layout;
3191 radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
3192 unsigned offset = layout->binding[binding].offset;
3193 unsigned stride;
3194 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
3195 layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
3196 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
3197 desc_ptr = ctx->push_constants;
3198 offset = pipeline_layout->push_constant_size + 16 * idx;
3199 stride = 16;
3200 } else {
3201 desc_ptr = load_desc_ptr(ctx, desc_set);
3202 stride = layout->binding[binding].size;
3203 }
3204
3205 nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
3206 unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
3207 if (stride != 1) {
3208 if (nir_const_index) {
3209 const_index = const_index * stride;
3210 } else if (index.type() == RegType::vgpr) {
3211 bool index24bit = layout->binding[binding].array_size <= 0x1000000;
3212 index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
3213 } else {
3214 index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
3215 }
3216 }
3217 if (offset) {
3218 if (nir_const_index) {
3219 const_index = const_index + offset;
3220 } else if (index.type() == RegType::vgpr) {
3221 index = bld.vadd32(bld.def(v1), Operand(offset), index);
3222 } else {
3223 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
3224 }
3225 }
3226
3227 if (nir_const_index && const_index == 0) {
3228 index = desc_ptr;
3229 } else if (index.type() == RegType::vgpr) {
3230 index = bld.vadd32(bld.def(v1),
3231 nir_const_index ? Operand(const_index) : Operand(index),
3232 Operand(desc_ptr));
3233 } else {
3234 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
3235 nir_const_index ? Operand(const_index) : Operand(index),
3236 Operand(desc_ptr));
3237 }
3238
3239 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
3240 }
3241
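/* Load 'num_components' dwords from a buffer resource. SGPR destinations use
 * SMEM s_buffer_load_* (except coherent loads before GFX8); everything else
 * uses MUBUF, split into two loads if more than 16 bytes are needed. */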
3242 void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, Temp rsrc, Temp offset, bool glc=false)
3243 {
3244 Builder bld(ctx->program, ctx->block);
3245
3246 unsigned num_bytes = dst.size() * 4;
3247 bool dlc = glc && ctx->options->chip_class >= GFX10;
3248
3249 aco_opcode op;
3250 if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
3251 if (ctx->options->chip_class < GFX8)
3252 offset = as_vgpr(ctx, offset);
3253
3254 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3255 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
3256 unsigned const_offset = 0;
3257
3258 Temp lower = Temp();
3259 if (num_bytes > 16) {
3260 assert(num_components == 3 || num_components == 4);
3261 op = aco_opcode::buffer_load_dwordx4;
3262 lower = bld.tmp(v4);
3263 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3264 mubuf->definitions[0] = Definition(lower);
3265 mubuf->operands[0] = vaddr;
3266 mubuf->operands[1] = Operand(rsrc);
3267 mubuf->operands[2] = soffset;
3268 mubuf->offen = (offset.type() == RegType::vgpr);
3269 mubuf->glc = glc;
3270 mubuf->dlc = dlc;
3271 mubuf->barrier = barrier_buffer;
3272 bld.insert(std::move(mubuf));
3273 emit_split_vector(ctx, lower, 2);
3274 num_bytes -= 16;
3275 const_offset = 16;
3276 }
3277
3278 switch (num_bytes) {
3279 case 4:
3280 op = aco_opcode::buffer_load_dword;
3281 break;
3282 case 8:
3283 op = aco_opcode::buffer_load_dwordx2;
3284 break;
3285 case 12:
3286 op = aco_opcode::buffer_load_dwordx3;
3287 break;
3288 case 16:
3289 op = aco_opcode::buffer_load_dwordx4;
3290 break;
3291 default:
3292 unreachable("Load SSBO not implemented for this size.");
3293 }
3294 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3295 mubuf->operands[0] = vaddr;
3296 mubuf->operands[1] = Operand(rsrc);
3297 mubuf->operands[2] = soffset;
3298 mubuf->offen = (offset.type() == RegType::vgpr);
3299 mubuf->glc = glc;
3300 mubuf->dlc = dlc;
3301 mubuf->barrier = barrier_buffer;
3302 mubuf->offset = const_offset;
3303 aco_ptr<Instruction> instr = std::move(mubuf);
3304
3305 if (dst.size() > 4) {
3306 assert(lower != Temp());
3307 Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size());
3308 instr->definitions[0] = Definition(upper);
3309 bld.insert(std::move(instr));
3310 if (dst.size() == 8)
3311 emit_split_vector(ctx, upper, 2);
3312 instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1));
3313 instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2));
3314 instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2));
3315 instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2));
3316 if (dst.size() == 8)
3317 instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2));
3318 }
3319
3320 if (dst.type() == RegType::sgpr) {
3321 Temp vec = bld.tmp(RegType::vgpr, dst.size());
3322 instr->definitions[0] = Definition(vec);
3323 bld.insert(std::move(instr));
3324 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
3325 } else {
3326 instr->definitions[0] = Definition(dst);
3327 bld.insert(std::move(instr));
3328 }
3329 } else {
3330 switch (num_bytes) {
3331 case 4:
3332 op = aco_opcode::s_buffer_load_dword;
3333 break;
3334 case 8:
3335 op = aco_opcode::s_buffer_load_dwordx2;
3336 break;
3337 case 12:
3338 case 16:
3339 op = aco_opcode::s_buffer_load_dwordx4;
3340 break;
3341 case 24:
3342 case 32:
3343 op = aco_opcode::s_buffer_load_dwordx8;
3344 break;
3345 default:
3346 unreachable("Load SSBO not implemented for this size.");
3347 }
3348 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
3349 load->operands[0] = Operand(rsrc);
3350 load->operands[1] = Operand(bld.as_uniform(offset));
3351 assert(load->operands[1].getTemp().type() == RegType::sgpr);
3352 load->definitions[0] = Definition(dst);
3353 load->glc = glc;
3354 load->dlc = dlc;
3355 load->barrier = barrier_buffer;
3356 assert(ctx->options->chip_class >= GFX8 || !glc);
3357
3358 /* trim vector */
3359 if (dst.size() == 3) {
3360 Temp vec = bld.tmp(s4);
3361 load->definitions[0] = Definition(vec);
3362 bld.insert(std::move(load));
3363 emit_split_vector(ctx, vec, 4);
3364
3365 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3366 emit_extract_vector(ctx, vec, 0, s1),
3367 emit_extract_vector(ctx, vec, 1, s1),
3368 emit_extract_vector(ctx, vec, 2, s1));
3369 } else if (dst.size() == 6) {
3370 Temp vec = bld.tmp(s8);
3371 load->definitions[0] = Definition(vec);
3372 bld.insert(std::move(load));
3373 emit_split_vector(ctx, vec, 4);
3374
3375 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3376 emit_extract_vector(ctx, vec, 0, s2),
3377 emit_extract_vector(ctx, vec, 1, s2),
3378 emit_extract_vector(ctx, vec, 2, s2));
3379 } else {
3380 bld.insert(std::move(load));
3381 }
3382
3383 }
3384 emit_split_vector(ctx, dst, num_components);
3385 }
3386
3387 void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
3388 {
3389 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3390 Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
3391
3392 Builder bld(ctx->program, ctx->block);
3393
3394 nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3395 unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
3396 unsigned binding = nir_intrinsic_binding(idx_instr);
3397 radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
3398
3399 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
3400 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3401 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3402 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3403 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3404 if (ctx->options->chip_class >= GFX10) {
3405 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3406 S_008F0C_OOB_SELECT(3) |
3407 S_008F0C_RESOURCE_LEVEL(1);
3408 } else {
3409 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3410 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3411 }
3412 Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
3413 Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
3414 Operand(0xFFFFFFFFu),
3415 Operand(desc_type));
3416 rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3417 rsrc, upper_dwords);
3418 } else {
3419 rsrc = convert_pointer_to_64_bit(ctx, rsrc);
3420 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
3421 }
3422
3423 load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa));
3424 }
3425
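/* load_push_constant: constants promoted to user SGPRs are returned directly,
 * anything else is loaded from the push constant buffer with SMEM. */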
3426 void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3427 {
3428 Builder bld(ctx->program, ctx->block);
3429 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3430
3431 unsigned offset = nir_intrinsic_base(instr);
3432 nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
3433 if (index_cv && instr->dest.ssa.bit_size == 32) {
3434
3435 unsigned count = instr->dest.ssa.num_components;
3436 unsigned start = (offset + index_cv->u32) / 4u;
3437 start -= ctx->base_inline_push_consts;
3438 if (start + count <= ctx->num_inline_push_consts) {
3439 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
3440 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
3441 for (unsigned i = 0; i < count; ++i) {
3442 elems[i] = ctx->inline_push_consts[start + i];
3443 vec->operands[i] = Operand{elems[i]};
3444 }
3445 vec->definitions[0] = Definition(dst);
3446 ctx->block->instructions.emplace_back(std::move(vec));
3447 ctx->allocated_vec.emplace(dst.id(), elems);
3448 return;
3449 }
3450 }
3451
3452 Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
3453 if (offset != 0) // TODO check if index != 0 as well
3454 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
3455 Temp ptr = convert_pointer_to_64_bit(ctx, ctx->push_constants);
3456 Temp vec = dst;
3457 bool trim = false;
3458 aco_opcode op;
3459
3460 switch (dst.size()) {
3461 case 1:
3462 op = aco_opcode::s_load_dword;
3463 break;
3464 case 2:
3465 op = aco_opcode::s_load_dwordx2;
3466 break;
3467 case 3:
3468 vec = bld.tmp(s4);
3469 trim = true;
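/* fallthrough */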
3470 case 4:
3471 op = aco_opcode::s_load_dwordx4;
3472 break;
3473 case 6:
3474 vec = bld.tmp(s8);
3475 trim = true;
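/* fallthrough */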
3476 case 8:
3477 op = aco_opcode::s_load_dwordx8;
3478 break;
3479 default:
3480 unreachable("unimplemented or forbidden load_push_constant.");
3481 }
3482
3483 bld.smem(op, Definition(vec), ptr, index);
3484
3485 if (trim) {
3486 emit_split_vector(ctx, vec, 4);
3487 RegClass rc = dst.size() == 3 ? s1 : s2;
3488 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3489 emit_extract_vector(ctx, vec, 0, rc),
3490 emit_extract_vector(ctx, vec, 1, rc),
3491 emit_extract_vector(ctx, vec, 2, rc));
3492
3493 }
3494 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
3495 }
3496
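/* load_constant: read from the shader's embedded constant data through a
 * buffer descriptor built from p_constaddr and the constant data size. */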
3497 void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3498 {
3499 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3500
3501 Builder bld(ctx->program, ctx->block);
3502
3503 uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3504 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3505 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3506 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3507 if (ctx->options->chip_class >= GFX10) {
3508 desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3509 S_008F0C_OOB_SELECT(3) |
3510 S_008F0C_RESOURCE_LEVEL(1);
3511 } else {
3512 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3513 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3514 }
3515
3516 unsigned base = nir_intrinsic_base(instr);
3517 unsigned range = nir_intrinsic_range(instr);
3518
3519 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
3520 if (base && offset.type() == RegType::sgpr)
3521 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
3522 else if (base && offset.type() == RegType::vgpr)
3523 offset = bld.vadd32(bld.def(v1), Operand(base), offset);
3524
3525 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3526 bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
3527 Operand(MIN2(base + range, ctx->shader->constant_data_size)),
3528 Operand(desc_type));
3529
3530 load_buffer(ctx, instr->num_components, dst, rsrc, offset);
3531 }
3532
3533 void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
3534 {
3535 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3536 ctx->cf_info.exec_potentially_empty = true;
3537
3538 ctx->program->needs_exact = true;
3539
3540 // TODO: optimize uniform conditions
3541 Builder bld(ctx->program, ctx->block);
3542 Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
3543 src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
3544 bld.pseudo(aco_opcode::p_discard_if, src);
3545 ctx->block->kind |= block_kind_uses_discard_if;
3546 return;
3547 }
3548
3549 void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
3550 {
3551 Builder bld(ctx->program, ctx->block);
3552
3553 if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3554 ctx->cf_info.exec_potentially_empty = true;
3555
3556 bool divergent = ctx->cf_info.parent_if.is_divergent ||
3557 ctx->cf_info.parent_loop.has_divergent_continue;
3558
3559 if (ctx->block->loop_nest_depth &&
3560 ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
3561 /* we handle discards the same way as jump instructions */
3562 append_logical_end(ctx->block);
3563
3564 /* in loops, discard behaves like break */
3565 Block *linear_target = ctx->cf_info.parent_loop.exit;
3566 ctx->block->kind |= block_kind_discard;
3567
3568 if (!divergent) {
3569 /* uniform discard - loop ends here */
3570 assert(nir_instr_is_last(&instr->instr));
3571 ctx->block->kind |= block_kind_uniform;
3572 ctx->cf_info.has_branch = true;
3573 bld.branch(aco_opcode::p_branch);
3574 add_linear_edge(ctx->block->index, linear_target);
3575 return;
3576 }
3577
3578 /* we add a break right after the discard instruction(s) */
3579 ctx->block->kind |= block_kind_break;
3580 unsigned idx = ctx->block->index;
3581
3582 /* remove critical edges from linear CFG */
3583 bld.branch(aco_opcode::p_branch);
3584 Block* break_block = ctx->program->create_and_insert_block();
3585 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3586 break_block->kind |= block_kind_uniform;
3587 add_linear_edge(idx, break_block);
3588 add_linear_edge(break_block->index, linear_target);
3589 bld.reset(break_block);
3590 bld.branch(aco_opcode::p_branch);
3591
3592 Block* continue_block = ctx->program->create_and_insert_block();
3593 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3594 add_linear_edge(idx, continue_block);
3595 append_logical_start(continue_block);
3596 ctx->block = continue_block;
3597
3598 return;
3599 }
3600
3601 /* it can currently happen that NIR doesn't remove the unreachable code */
3602 if (!nir_instr_is_last(&instr->instr)) {
3603 ctx->program->needs_exact = true;
3604 /* save exec somewhere temporarily so that it doesn't get
3605 * overwritten before the discard from outer exec masks */
3606 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, s2));
3607 bld.pseudo(aco_opcode::p_discard_if, cond);
3608 ctx->block->kind |= block_kind_uses_discard_if;
3609 return;
3610 }
3611
3612 /* This condition is incorrect for uniformly branched discards in a loop
3613 * predicated by a divergent condition, but the above code catches that case
3614 * and the discard would end up turning into a discard_if.
3615 * For example:
3616 * if (divergent) {
3617 * while (...) {
3618 * if (uniform) {
3619 * discard;
3620 * }
3621 * }
3622 * }
3623 */
3624 if (!ctx->cf_info.parent_if.is_divergent) {
3625 /* program just ends here */
3626 ctx->block->kind |= block_kind_uniform;
3627 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
3628 0 /* enabled mask */, 9 /* dest */,
3629 false /* compressed */, true/* done */, true /* valid mask */);
3630 bld.sopp(aco_opcode::s_endpgm);
3631 // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
3632 } else {
3633 ctx->block->kind |= block_kind_discard;
3634 /* branch and linear edge is added by visit_if() */
3635 }
3636 }
3637
3638 enum aco_descriptor_type {
3639 ACO_DESC_IMAGE,
3640 ACO_DESC_FMASK,
3641 ACO_DESC_SAMPLER,
3642 ACO_DESC_BUFFER,
3643 ACO_DESC_PLANE_0,
3644 ACO_DESC_PLANE_1,
3645 ACO_DESC_PLANE_2,
3646 };
3647
3648 static bool
3649 should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
3650 if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
3651 return false;
3652 ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
3653 return dim == ac_image_cube ||
3654 dim == ac_image_1darray ||
3655 dim == ac_image_2darray ||
3656 dim == ac_image_2darraymsaa;
3657 }
3658
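/* Walk an image/sampler deref chain and load the requested descriptor type
 * (image, fmask, sampler, buffer or plane) from its set. Constant array
 * indices are folded into the offset, divergent indices are made uniform with
 * v_readfirstlane_b32, and immutable samplers are materialized as constants. */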
3659 Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
3660 enum aco_descriptor_type desc_type,
3661 const nir_tex_instr *tex_instr, bool image, bool write)
3662 {
3663 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
3664 std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
3665 if (it != ctx->tex_desc.end())
3666 return it->second;
3667 */
3668 Temp index = Temp();
3669 bool index_set = false;
3670 unsigned constant_index = 0;
3671 unsigned descriptor_set;
3672 unsigned base_index;
3673 Builder bld(ctx->program, ctx->block);
3674
3675 if (!deref_instr) {
3676 assert(tex_instr && !image);
3677 descriptor_set = 0;
3678 base_index = tex_instr->sampler_index;
3679 } else {
3680 while(deref_instr->deref_type != nir_deref_type_var) {
3681 unsigned array_size = glsl_get_aoa_size(deref_instr->type);
3682 if (!array_size)
3683 array_size = 1;
3684
3685 assert(deref_instr->deref_type == nir_deref_type_array);
3686 nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
3687 if (const_value) {
3688 constant_index += array_size * const_value->u32;
3689 } else {
3690 Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
3691 if (indirect.type() == RegType::vgpr)
3692 indirect = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), indirect);
3693
3694 if (array_size != 1)
3695 indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
3696
3697 if (!index_set) {
3698 index = indirect;
3699 index_set = true;
3700 } else {
3701 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
3702 }
3703 }
3704
3705 deref_instr = nir_src_as_deref(deref_instr->parent);
3706 }
3707 descriptor_set = deref_instr->var->data.descriptor_set;
3708 base_index = deref_instr->var->data.binding;
3709 }
3710
3711 Temp list = load_desc_ptr(ctx, descriptor_set);
3712 list = convert_pointer_to_64_bit(ctx, list);
3713
3714 struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
3715 struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
3716 unsigned offset = binding->offset;
3717 unsigned stride = binding->size;
3718 aco_opcode opcode;
3719 RegClass type;
3720
3721 assert(base_index < layout->binding_count);
3722
3723 switch (desc_type) {
3724 case ACO_DESC_IMAGE:
3725 type = s8;
3726 opcode = aco_opcode::s_load_dwordx8;
3727 break;
3728 case ACO_DESC_FMASK:
3729 type = s8;
3730 opcode = aco_opcode::s_load_dwordx8;
3731 offset += 32;
3732 break;
3733 case ACO_DESC_SAMPLER:
3734 type = s4;
3735 opcode = aco_opcode::s_load_dwordx4;
3736 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
3737 offset += radv_combined_image_descriptor_sampler_offset(binding);
3738 break;
3739 case ACO_DESC_BUFFER:
3740 type = s4;
3741 opcode = aco_opcode::s_load_dwordx4;
3742 break;
3743 case ACO_DESC_PLANE_0:
3744 case ACO_DESC_PLANE_1:
3745 type = s8;
3746 opcode = aco_opcode::s_load_dwordx8;
3747 offset += 32 * (desc_type - ACO_DESC_PLANE_0);
3748 break;
3749 case ACO_DESC_PLANE_2:
3750 type = s4;
3751 opcode = aco_opcode::s_load_dwordx4;
3752 offset += 64;
3753 break;
3754 default:
3755 unreachable("invalid desc_type\n");
3756 }
3757
3758 offset += constant_index * stride;
3759
3760 if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
3761 (!index_set || binding->immutable_samplers_equal)) {
3762 if (binding->immutable_samplers_equal)
3763 constant_index = 0;
3764
3765 const uint32_t *samplers = radv_immutable_samplers(layout, binding);
3766 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3767 Operand(samplers[constant_index * 4 + 0]),
3768 Operand(samplers[constant_index * 4 + 1]),
3769 Operand(samplers[constant_index * 4 + 2]),
3770 Operand(samplers[constant_index * 4 + 3]));
3771 }
3772
3773 Operand off;
3774 if (!index_set) {
3775 off = Operand(offset);
3776 } else {
3777 off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
3778 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
3779 }
3780
3781 Temp res = bld.smem(opcode, bld.def(type), list, off);
3782
3783 if (desc_type == ACO_DESC_PLANE_2) {
3784 Temp components[8];
3785 for (unsigned i = 0; i < 8; i++)
3786 components[i] = bld.tmp(s1);
3787 bld.pseudo(aco_opcode::p_split_vector,
3788 Definition(components[0]),
3789 Definition(components[1]),
3790 Definition(components[2]),
3791 Definition(components[3]),
3792 res);
3793
3794 Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
3795 bld.pseudo(aco_opcode::p_split_vector,
3796 bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
3797 Definition(components[4]),
3798 Definition(components[5]),
3799 Definition(components[6]),
3800 Definition(components[7]),
3801 desc2);
3802
3803 res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
3804 components[0], components[1], components[2], components[3],
3805 components[4], components[5], components[6], components[7]);
3806 }
3807
3808 return res;
3809 }
3810
3811 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
3812 {
3813 switch (dim) {
3814 case GLSL_SAMPLER_DIM_BUF:
3815 return 1;
3816 case GLSL_SAMPLER_DIM_1D:
3817 return array ? 2 : 1;
3818 case GLSL_SAMPLER_DIM_2D:
3819 return array ? 3 : 2;
3820 case GLSL_SAMPLER_DIM_MS:
3821 return array ? 4 : 3;
3822 case GLSL_SAMPLER_DIM_3D:
3823 case GLSL_SAMPLER_DIM_CUBE:
3824 return 3;
3825 case GLSL_SAMPLER_DIM_RECT:
3826 case GLSL_SAMPLER_DIM_SUBPASS:
3827 return 2;
3828 case GLSL_SAMPLER_DIM_SUBPASS_MS:
3829 return 3;
3830 default:
3831 break;
3832 }
3833 return 0;
3834 }
3835
3836
3837 /* Adjust the sample index according to FMASK.
3838 *
3839 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3840 * which is the identity mapping. Each nibble says which physical sample
3841 * should be fetched to get that sample.
3842 *
3843 * For example, 0x11111100 means there are only 2 samples stored and
3844 * the second sample covers 3/4 of the pixel. When reading samples 0
3845 * and 1, return physical sample 0 (determined by the first two 0s
3846 * in FMASK), otherwise return physical sample 1.
3847 *
3848 * The sample index should be adjusted as follows:
3849 * sample_index = (fmask >> (sample_index * 4)) & 0xF;
3850 */
3851 static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coords, Operand sample_index, Temp fmask_desc_ptr)
3852 {
3853 Builder bld(ctx->program, ctx->block);
3854 Temp fmask = bld.tmp(v1);
3855 unsigned dim = ctx->options->chip_class >= GFX10
3856 ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
3857 : 0;
3858
3859 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
3860 load->operands[0] = Operand(coords);
3861 load->operands[1] = Operand(fmask_desc_ptr);
3862 load->definitions[0] = Definition(fmask);
3863 load->glc = false;
3864 load->dlc = false;
3865 load->dmask = 0x1;
3866 load->unrm = true;
3867 load->da = da;
3868 load->dim = dim;
3869 load->can_reorder = true; /* fmask images shouldn't be modified */
3870 ctx->block->instructions.emplace_back(std::move(load));
3871
3872 Operand sample_index4;
3873 if (sample_index.isConstant() && sample_index.constantValue() < 16) {
3874 sample_index4 = Operand(sample_index.constantValue() << 2);
3875 } else if (sample_index.regClass() == s1) {
3876 sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
3877 } else {
3878 assert(sample_index.regClass() == v1);
3879 sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
3880 }
3881
3882 Temp final_sample;
3883 if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
3884 final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
3885 else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
3886 final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
3887 else
3888 final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
3889
3890 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3891 * resource descriptor is 0 (invalid).
3892 */
3893 Temp compare = bld.tmp(s2);
3894 bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
3895 Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
3896
3897 Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
3898
3899 /* Replace the MSAA sample index. */
3900 return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
3901 }
3902
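/* Build the coordinate vector for an image instruction: pack the required
 * components, append the (FMASK-adjusted) sample index for MSAA images and add
 * the zero y coordinate that 1D images need on GFX9. */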
3903 static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
3904 {
3905
3906 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
3907 enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3908 bool is_array = glsl_sampler_type_is_array(type);
3909 ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3910 assert(!add_frag_pos && "Input attachments should be lowered.");
3911 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3912 bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
3913 int count = image_type_to_components_count(dim, is_array);
3914 std::vector<Operand> coords(count);
3915
3916 if (is_ms) {
3917 Operand sample_index;
3918 nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
3919 if (sample_cv)
3920 sample_index = Operand(sample_cv->u32);
3921 else
3922 sample_index = Operand(emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1));
3923
3924 if (instr->intrinsic == nir_intrinsic_image_deref_load) {
3925 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, is_array ? 3 : 2, 1)};
3926 for (unsigned i = 0; i < vec->operands.size(); i++)
3927 vec->operands[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3928 Temp fmask_load_address = {ctx->program->allocateId(), is_array ? v3 : v2};
3929 vec->definitions[0] = Definition(fmask_load_address);
3930 ctx->block->instructions.emplace_back(std::move(vec));
3931
3932 Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
3933 sample_index = Operand(adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr));
3934 }
3935 count--;
3936 coords[count] = sample_index;
3937 }
3938
3939 if (count == 1 && !gfx9_1d)
3940 return emit_extract_vector(ctx, src0, 0, v1);
3941
3942 if (gfx9_1d) {
3943 coords[0] = Operand(emit_extract_vector(ctx, src0, 0, v1));
3944 coords.resize(coords.size() + 1);
3945 coords[1] = Operand((uint32_t) 0);
3946 if (is_array)
3947 coords[2] = Operand(emit_extract_vector(ctx, src0, 1, v1));
3948 } else {
3949 for (int i = 0; i < count; i++)
3950 coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3951 }
3952
3953 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
3954 for (unsigned i = 0; i < coords.size(); i++)
3955 vec->operands[i] = coords[i];
3956 Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
3957 vec->definitions[0] = Definition(res);
3958 ctx->block->instructions.emplace_back(std::move(vec));
3959 return res;
3960 }
3961
3962
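/* image_deref_load: buffer images use MUBUF buffer_load_format_*, all other
 * dimensions use MIMG image_load with the dmask limited to the components that
 * are actually read. */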
3963 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
3964 {
3965 Builder bld(ctx->program, ctx->block);
3966 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
3967 const struct glsl_type *type = glsl_without_array(var->type);
3968 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3969 bool is_array = glsl_sampler_type_is_array(type);
3970 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3971
3972 if (dim == GLSL_SAMPLER_DIM_BUF) {
3973 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
3974 unsigned num_channels = util_last_bit(mask);
3975 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
3976 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
3977
3978 aco_opcode opcode;
3979 switch (num_channels) {
3980 case 1:
3981 opcode = aco_opcode::buffer_load_format_x;
3982 break;
3983 case 2:
3984 opcode = aco_opcode::buffer_load_format_xy;
3985 break;
3986 case 3:
3987 opcode = aco_opcode::buffer_load_format_xyz;
3988 break;
3989 case 4:
3990 opcode = aco_opcode::buffer_load_format_xyzw;
3991 break;
3992 default:
3993 unreachable(">4 channel buffer image load");
3994 }
3995 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
3996 load->operands[0] = Operand(vindex);
3997 load->operands[1] = Operand(rsrc);
3998 load->operands[2] = Operand((uint32_t) 0);
3999 Temp tmp;
4000 if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
4001 tmp = dst;
4002 else
4003 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
4004 load->definitions[0] = Definition(tmp);
4005 load->idxen = true;
4006 load->barrier = barrier_image;
4007 ctx->block->instructions.emplace_back(std::move(load));
4008
4009 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
4010 return;
4011 }
4012
4013 Temp coords = get_image_coords(ctx, instr, type);
4014 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4015
4016 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
4017 unsigned num_components = util_bitcount(dmask);
4018 Temp tmp;
4019 if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
4020 tmp = dst;
4021 else
4022 tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
4023
4024 aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
4025 load->operands[0] = Operand(coords);
4026 load->operands[1] = Operand(resource);
4027 load->definitions[0] = Definition(tmp);
4028 load->glc = var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
4029 load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4030 load->dmask = dmask;
4031 load->unrm = true;
4032 load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4033 load->barrier = barrier_image;
4034 ctx->block->instructions.emplace_back(std::move(load));
4035
4036 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
4037 return;
4038 }
4039
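/* image_deref_store: buffer images use MUBUF buffer_store_format_*, all other
 * dimensions use MIMG image_store. Stores disable WQM and mark the program
 * exact so they don't execute for helper invocations. */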
4040 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
4041 {
4042 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4043 const struct glsl_type *type = glsl_without_array(var->type);
4044 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4045 bool is_array = glsl_sampler_type_is_array(type);
4046 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
4047
4048 bool glc = ctx->options->chip_class == GFX6 || (var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE));
4049
4050 if (dim == GLSL_SAMPLER_DIM_BUF) {
4051 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
4052 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
4053 aco_opcode opcode;
4054 switch (data.size()) {
4055 case 1:
4056 opcode = aco_opcode::buffer_store_format_x;
4057 break;
4058 case 2:
4059 opcode = aco_opcode::buffer_store_format_xy;
4060 break;
4061 case 3:
4062 opcode = aco_opcode::buffer_store_format_xyz;
4063 break;
4064 case 4:
4065 opcode = aco_opcode::buffer_store_format_xyzw;
4066 break;
4067 default:
4068 unreachable(">4 channel buffer image store");
4069 }
4070 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
4071 store->operands[0] = Operand(vindex);
4072 store->operands[1] = Operand(rsrc);
4073 store->operands[2] = Operand((uint32_t) 0);
4074 store->operands[3] = Operand(data);
4075 store->idxen = true;
4076 store->glc = glc;
4077 store->dlc = false;
4078 store->disable_wqm = true;
4079 store->barrier = barrier_image;
4080 ctx->program->needs_exact = true;
4081 ctx->block->instructions.emplace_back(std::move(store));
4082 return;
4083 }
4084
4085 assert(data.type() == RegType::vgpr);
4086 Temp coords = get_image_coords(ctx, instr, type);
4087 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4088
4089 aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(aco_opcode::image_store, Format::MIMG, 4, 0)};
4090 store->operands[0] = Operand(coords);
4091 store->operands[1] = Operand(resource);
4092 store->operands[2] = Operand(s4);
4093 store->operands[3] = Operand(data);
4094 store->glc = glc;
4095 store->dlc = false;
4096 store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4097 store->dmask = (1 << data.size()) - 1;
4098 store->unrm = true;
4099 store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4100 store->disable_wqm = true;
4101 store->barrier = barrier_image;
4102 ctx->program->needs_exact = true;
4103 ctx->block->instructions.emplace_back(std::move(store));
4104 return;
4105 }
4106
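/* image_deref_atomic_*: map the NIR atomic to the corresponding MUBUF/MIMG
 * atomic opcode. The previous value is only returned (glc=1) when the NIR
 * destination is actually used. */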
4107 void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4108 {
4109 /* return the previous value if dest is ever used */
4110 bool return_previous = false;
4111 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4112 return_previous = true;
4113 break;
4114 }
4115 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4116 return_previous = true;
4117 break;
4118 }
4119
4120 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4121 const struct glsl_type *type = glsl_without_array(var->type);
4122 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4123 bool is_array = glsl_sampler_type_is_array(type);
4124 Builder bld(ctx->program, ctx->block);
4125
4126 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
4127 assert(data.size() == 1 && "64-bit image atomics not yet implemented.");
4128
4129 if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
4130 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
4131
4132 aco_opcode buf_op, image_op;
4133 switch (instr->intrinsic) {
4134 case nir_intrinsic_image_deref_atomic_add:
4135 buf_op = aco_opcode::buffer_atomic_add;
4136 image_op = aco_opcode::image_atomic_add;
4137 break;
4138 case nir_intrinsic_image_deref_atomic_umin:
4139 buf_op = aco_opcode::buffer_atomic_umin;
4140 image_op = aco_opcode::image_atomic_umin;
4141 break;
4142 case nir_intrinsic_image_deref_atomic_imin:
4143 buf_op = aco_opcode::buffer_atomic_smin;
4144 image_op = aco_opcode::image_atomic_smin;
4145 break;
4146 case nir_intrinsic_image_deref_atomic_umax:
4147 buf_op = aco_opcode::buffer_atomic_umax;
4148 image_op = aco_opcode::image_atomic_umax;
4149 break;
4150 case nir_intrinsic_image_deref_atomic_imax:
4151 buf_op = aco_opcode::buffer_atomic_smax;
4152 image_op = aco_opcode::image_atomic_smax;
4153 break;
4154 case nir_intrinsic_image_deref_atomic_and:
4155 buf_op = aco_opcode::buffer_atomic_and;
4156 image_op = aco_opcode::image_atomic_and;
4157 break;
4158 case nir_intrinsic_image_deref_atomic_or:
4159 buf_op = aco_opcode::buffer_atomic_or;
4160 image_op = aco_opcode::image_atomic_or;
4161 break;
4162 case nir_intrinsic_image_deref_atomic_xor:
4163 buf_op = aco_opcode::buffer_atomic_xor;
4164 image_op = aco_opcode::image_atomic_xor;
4165 break;
4166 case nir_intrinsic_image_deref_atomic_exchange:
4167 buf_op = aco_opcode::buffer_atomic_swap;
4168 image_op = aco_opcode::image_atomic_swap;
4169 break;
4170 case nir_intrinsic_image_deref_atomic_comp_swap:
4171 buf_op = aco_opcode::buffer_atomic_cmpswap;
4172 image_op = aco_opcode::image_atomic_cmpswap;
4173 break;
4174 default:
4175 unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
4176 }
4177
4178 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4179
4180 if (dim == GLSL_SAMPLER_DIM_BUF) {
4181 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
4182 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
4183 //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
4184 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
4185 mubuf->operands[0] = Operand(vindex);
4186 mubuf->operands[1] = Operand(resource);
4187 mubuf->operands[2] = Operand((uint32_t)0);
4188 mubuf->operands[3] = Operand(data);
4189 if (return_previous)
4190 mubuf->definitions[0] = Definition(dst);
4191 mubuf->offset = 0;
4192 mubuf->idxen = true;
4193 mubuf->glc = return_previous;
4194 mubuf->dlc = false; /* Not needed for atomics */
4195 mubuf->disable_wqm = true;
4196 mubuf->barrier = barrier_image;
4197 ctx->program->needs_exact = true;
4198 ctx->block->instructions.emplace_back(std::move(mubuf));
4199 return;
4200 }
4201
4202 Temp coords = get_image_coords(ctx, instr, type);
4203 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
4204 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 4, return_previous ? 1 : 0)};
4205 mimg->operands[0] = Operand(coords);
4206 mimg->operands[1] = Operand(resource);
4207 mimg->operands[2] = Operand(s4); /* no sampler */
4208 mimg->operands[3] = Operand(data);
4209 if (return_previous)
4210 mimg->definitions[0] = Definition(dst);
4211 mimg->glc = return_previous;
4212 mimg->dlc = false; /* Not needed for atomics */
4213 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4214 mimg->dmask = (1 << data.size()) - 1;
4215 mimg->unrm = true;
4216 mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
4217 mimg->disable_wqm = true;
4218 mimg->barrier = barrier_image;
4219 ctx->program->needs_exact = true;
4220 ctx->block->instructions.emplace_back(std::move(mimg));
4221 return;
4222 }
4223
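/* Return the size of a buffer descriptor. For element counts on GFX8 the size
 * field has to be divided by the descriptor's stride, done with
 * v_rcp_iflag_f32 and float multiplies since there is no integer division;
 * otherwise the size field is returned as-is. */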
4224 void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
4225 {
4226 if (in_elements && ctx->options->chip_class == GFX8) {
4227 Builder bld(ctx->program, ctx->block);
4228
4229 Temp stride = emit_extract_vector(ctx, desc, 1, s1);
4230 stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
4231 stride = bld.vop1(aco_opcode::v_cvt_f32_ubyte0, bld.def(v1), stride);
4232 stride = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), stride);
4233
4234 Temp size = emit_extract_vector(ctx, desc, 2, s1);
4235 size = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), size);
4236
4237 Temp res = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), size, stride);
4238 res = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), res);
4239 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
4240
4241 // TODO: we can probably calculate this faster on the scalar unit to do: size / stride{1,2,4,8,12,16}
4242 /* idea
4243 * for 1,2,4,8,16, the result is just (stride >> S_FF1_I32_B32)
4244 * in case 12 (or 3?), we have to divide by 3:
4245 * set v_skip in case it's 12 (if we also have to take care of 3, shift first)
4246 * use v_mul_hi_u32 with magic number to divide
4247 * we need some pseudo merge opcode to overwrite the original SALU result with readfirstlane
4248 * disable v_skip
4249 * total: 6 SALU + 2 VALU instructions vs 1 SALU + 6 VALU instructions
4250 */
4251
4252 } else {
4253 emit_extract_vector(ctx, desc, 2, dst);
4254 }
4255 }
4256
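/* image_deref_size: buffer images take the size from the descriptor, other
 * images use image_get_resinfo. Cube arrays divide the layer count by 6, and
 * GFX9 1D arrays use dmask 0x5 to fetch width and layer count. */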
4257 void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
4258 {
4259 const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4260 const struct glsl_type *type = glsl_without_array(var->type);
4261 const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4262 bool is_array = glsl_sampler_type_is_array(type);
4263 Builder bld(ctx->program, ctx->block);
4264
4265 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
4266 Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
4267 return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
4268 }
4269
4270 /* LOD */
4271 Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
4272
4273 /* Resource */
4274 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
4275
4276 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4277
4278 aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)};
4279 mimg->operands[0] = Operand(lod);
4280 mimg->operands[1] = Operand(resource);
4281 unsigned& dmask = mimg->dmask;
4282 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4283 mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
4284 mimg->da = glsl_sampler_type_is_array(type);
4285 mimg->can_reorder = true;
4286 Definition& def = mimg->definitions[0];
4287 ctx->block->instructions.emplace_back(std::move(mimg));
4288
4289 if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
4290 glsl_sampler_type_is_array(type)) {
4291
4292 assert(instr->dest.ssa.num_components == 3);
4293 Temp tmp = {ctx->program->allocateId(), v3};
4294 def = Definition(tmp);
4295 emit_split_vector(ctx, tmp, 3);
4296
4297 /* divide 3rd value by 6 by multiplying with magic number */
4298 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
4299 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
4300
4301 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4302 emit_extract_vector(ctx, tmp, 0, v1),
4303 emit_extract_vector(ctx, tmp, 1, v1),
4304 by_6);
4305
4306 } else if (ctx->options->chip_class == GFX9 &&
4307 glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
4308 glsl_sampler_type_is_array(type)) {
4309 assert(instr->dest.ssa.num_components == 2);
4310 def = Definition(dst);
4311 dmask = 0x5;
4312 } else {
4313 def = Definition(dst);
4314 }
4315
4316 emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
4317 }
4318
4319 void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4320 {
4321 Builder bld(ctx->program, ctx->block);
4322 unsigned num_components = instr->num_components;
4323
4324 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4325 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4326 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4327
4328 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4329 load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc);
4330 }
4331
4332 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4333 {
4334 Builder bld(ctx->program, ctx->block);
4335 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4336 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4337 unsigned writemask = nir_intrinsic_write_mask(instr);
4338
4339 Temp offset;
4340 if (ctx->options->chip_class < GFX8)
4341 offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
4342 else
4343 offset = get_ssa_temp(ctx, instr->src[2].ssa);
4344
4345 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4346 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4347
4348 bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
4349 ctx->options->chip_class >= GFX8;
4350 if (smem)
4351 offset = bld.as_uniform(offset);
4352 bool smem_nonfs = smem && ctx->stage != fragment_fs;
4353
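/* Split the store into runs of consecutive enabled components; each run is
 * emitted as one buffer store of at most four dwords. */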
4354 while (writemask) {
4355 int start, count;
4356 u_bit_scan_consecutive_range(&writemask, &start, &count);
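/* There is no s_buffer_store_dwordx3: store two dwords now and put the third
 * component back into the writemask for the next iteration. */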
4357 if (count == 3 && smem) {
4358 writemask |= 1u << (start + 2);
4359 count = 2;
4360 }
4361 int num_bytes = count * elem_size_bytes;
4362
4363 if (num_bytes > 16) {
4364 assert(elem_size_bytes == 8);
4365 writemask |= (((count - 2) << 1) - 1) << (start + 2);
4366 count = 2;
4367 num_bytes = 16;
4368 }
4369
4370 // TODO: check alignment of sub-dword stores
4371 // TODO: split 3 bytes. there is no store instruction for that
4372
4373 Temp write_data;
4374 if (count != instr->num_components) {
4375 emit_split_vector(ctx, data, instr->num_components);
4376 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4377 for (int i = 0; i < count; i++) {
4378 Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
4379 vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
4380 }
4381 write_data = bld.tmp(smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
4382 vec->definitions[0] = Definition(write_data);
4383 ctx->block->instructions.emplace_back(std::move(vec));
4384 } else if (!smem && data.type() != RegType::vgpr) {
4385 assert(num_bytes % 4 == 0);
4386 write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
4387 } else if (smem_nonfs && data.type() == RegType::vgpr) {
4388 assert(num_bytes % 4 == 0);
4389 write_data = bld.as_uniform(data);
4390 } else {
4391 write_data = data;
4392 }
4393
4394 aco_opcode vmem_op, smem_op;
4395 switch (num_bytes) {
4396 case 4:
4397 vmem_op = aco_opcode::buffer_store_dword;
4398 smem_op = aco_opcode::s_buffer_store_dword;
4399 break;
4400 case 8:
4401 vmem_op = aco_opcode::buffer_store_dwordx2;
4402 smem_op = aco_opcode::s_buffer_store_dwordx2;
4403 break;
4404 case 12:
4405 vmem_op = aco_opcode::buffer_store_dwordx3;
4406 smem_op = aco_opcode::last_opcode;
4407 assert(!smem);
4408 break;
4409 case 16:
4410 vmem_op = aco_opcode::buffer_store_dwordx4;
4411 smem_op = aco_opcode::s_buffer_store_dwordx4;
4412 break;
4413 default:
4414 unreachable("Store SSBO not implemented for this size.");
4415 }
4416 if (ctx->stage == fragment_fs)
4417 smem_op = aco_opcode::p_fs_buffer_store_smem;
4418
4419 if (smem) {
4420 aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
4421 store->operands[0] = Operand(rsrc);
4422 if (start) {
4423 Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
4424 offset, Operand(start * elem_size_bytes));
4425 store->operands[1] = Operand(off);
4426 } else {
4427 store->operands[1] = Operand(offset);
4428 }
4429 if (smem_op != aco_opcode::p_fs_buffer_store_smem)
4430 store->operands[1].setFixed(m0);
4431 store->operands[2] = Operand(write_data);
4432 store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4433 store->dlc = false;
4434 store->disable_wqm = true;
4435 store->barrier = barrier_buffer;
4436 ctx->block->instructions.emplace_back(std::move(store));
4437 ctx->program->wb_smem_l1_on_end = true;
4438 if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
4439 ctx->block->kind |= block_kind_needs_lowering;
4440 ctx->program->needs_exact = true;
4441 }
4442 } else {
4443 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
4444 store->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4445 store->operands[1] = Operand(rsrc);
4446 store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4447 store->operands[3] = Operand(write_data);
4448 store->offset = start * elem_size_bytes;
4449 store->offen = (offset.type() == RegType::vgpr);
4450 store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4451 store->dlc = false;
4452 store->disable_wqm = true;
4453 store->barrier = barrier_buffer;
4454 ctx->program->needs_exact = true;
4455 ctx->block->instructions.emplace_back(std::move(store));
4456 }
4457 }
4458 }
4459
4460 void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4461 {
4462 /* return the previous value if dest is ever used */
4463 bool return_previous = false;
4464 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4465 return_previous = true;
4466 break;
4467 }
4468 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4469 return_previous = true;
4470 break;
4471 }
4472
4473 Builder bld(ctx->program, ctx->block);
4474 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
4475
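/* For compare-and-swap, pack the compare value and the swap value into one
 * VGPR vector of twice the data size. */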
4476 if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
4477 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
4478 get_ssa_temp(ctx, instr->src[3].ssa), data);
4479
4480 Temp offset;
4481 if (ctx->options->chip_class < GFX8)
4482 offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4483 else
4484 offset = get_ssa_temp(ctx, instr->src[1].ssa);
4485
4486 Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4487 rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4488
4489 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4490
4491 aco_opcode op32, op64;
4492 switch (instr->intrinsic) {
4493 case nir_intrinsic_ssbo_atomic_add:
4494 op32 = aco_opcode::buffer_atomic_add;
4495 op64 = aco_opcode::buffer_atomic_add_x2;
4496 break;
4497 case nir_intrinsic_ssbo_atomic_imin:
4498 op32 = aco_opcode::buffer_atomic_smin;
4499 op64 = aco_opcode::buffer_atomic_smin_x2;
4500 break;
4501 case nir_intrinsic_ssbo_atomic_umin:
4502 op32 = aco_opcode::buffer_atomic_umin;
4503 op64 = aco_opcode::buffer_atomic_umin_x2;
4504 break;
4505 case nir_intrinsic_ssbo_atomic_imax:
4506 op32 = aco_opcode::buffer_atomic_smax;
4507 op64 = aco_opcode::buffer_atomic_smax_x2;
4508 break;
4509 case nir_intrinsic_ssbo_atomic_umax:
4510 op32 = aco_opcode::buffer_atomic_umax;
4511 op64 = aco_opcode::buffer_atomic_umax_x2;
4512 break;
4513 case nir_intrinsic_ssbo_atomic_and:
4514 op32 = aco_opcode::buffer_atomic_and;
4515 op64 = aco_opcode::buffer_atomic_and_x2;
4516 break;
4517 case nir_intrinsic_ssbo_atomic_or:
4518 op32 = aco_opcode::buffer_atomic_or;
4519 op64 = aco_opcode::buffer_atomic_or_x2;
4520 break;
4521 case nir_intrinsic_ssbo_atomic_xor:
4522 op32 = aco_opcode::buffer_atomic_xor;
4523 op64 = aco_opcode::buffer_atomic_xor_x2;
4524 break;
4525 case nir_intrinsic_ssbo_atomic_exchange:
4526 op32 = aco_opcode::buffer_atomic_swap;
4527 op64 = aco_opcode::buffer_atomic_swap_x2;
4528 break;
4529 case nir_intrinsic_ssbo_atomic_comp_swap:
4530 op32 = aco_opcode::buffer_atomic_cmpswap;
4531 op64 = aco_opcode::buffer_atomic_cmpswap_x2;
4532 break;
4533 default:
4534 unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
4535 }
4536 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
4537 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
4538 mubuf->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4539 mubuf->operands[1] = Operand(rsrc);
4540 mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4541 mubuf->operands[3] = Operand(data);
4542 if (return_previous)
4543 mubuf->definitions[0] = Definition(dst);
4544 mubuf->offset = 0;
4545 mubuf->offen = (offset.type() == RegType::vgpr);
4546 mubuf->glc = return_previous;
4547 mubuf->dlc = false; /* Not needed for atomics */
4548 mubuf->disable_wqm = true;
4549 mubuf->barrier = barrier_buffer;
4550 ctx->program->needs_exact = true;
4551 ctx->block->instructions.emplace_back(std::move(mubuf));
4552 }
4553
4554 void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
4555
4556 Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4557 Builder bld(ctx->program, ctx->block);
4558 Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
4559 get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
4560 }
4561
4562 void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
4563 {
4564 Builder bld(ctx->program, ctx->block);
4565 unsigned num_components = instr->num_components;
4566 unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8;
4567
4568 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4569 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
4570
4571 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4572 bool dlc = glc && ctx->options->chip_class >= GFX10;
4573 aco_opcode op;
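/* Use FLAT/GLOBAL when the result is divergent or when a coherent load is
 * needed on GFX6/7 (SMEM only supports GLC on GFX8+); otherwise use SMEM. */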
4574 if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
4575 bool global = ctx->options->chip_class >= GFX9;
4576 aco_opcode op;
4577 switch (num_bytes) {
4578 case 4:
4579 op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
4580 break;
4581 case 8:
4582 op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
4583 break;
4584 case 12:
4585 op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4586 break;
4587 case 16:
4588 op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
4589 break;
4590 default:
4591 unreachable("load_global not implemented for this size.");
4592 }
4593 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4594 flat->operands[0] = Operand(addr);
4595 flat->operands[1] = Operand(s1);
4596 flat->glc = glc;
4597 flat->dlc = dlc;
4598
4599 if (dst.type() == RegType::sgpr) {
4600 Temp vec = bld.tmp(RegType::vgpr, dst.size());
4601 flat->definitions[0] = Definition(vec);
4602 ctx->block->instructions.emplace_back(std::move(flat));
4603 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
4604 } else {
4605 flat->definitions[0] = Definition(dst);
4606 ctx->block->instructions.emplace_back(std::move(flat));
4607 }
4608 emit_split_vector(ctx, dst, num_components);
4609 } else {
4610 switch (num_bytes) {
4611 case 4:
4612 op = aco_opcode::s_load_dword;
4613 break;
4614 case 8:
4615 op = aco_opcode::s_load_dwordx2;
4616 break;
4617 case 12:
4618 case 16:
4619 op = aco_opcode::s_load_dwordx4;
4620 break;
4621 default:
4622 unreachable("load_global not implemented for this size.");
4623 }
4624 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4625 load->operands[0] = Operand(addr);
4626 load->operands[1] = Operand(0u);
4627 load->definitions[0] = Definition(dst);
4628 load->glc = glc;
4629 load->dlc = dlc;
4630 load->barrier = barrier_buffer;
4631 assert(ctx->options->chip_class >= GFX8 || !glc);
4632
4633 if (dst.size() == 3) {
4634 /* trim vector */
4635 Temp vec = bld.tmp(s4);
4636 load->definitions[0] = Definition(vec);
4637 ctx->block->instructions.emplace_back(std::move(load));
4638 emit_split_vector(ctx, vec, 4);
4639
4640 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4641 emit_extract_vector(ctx, vec, 0, s1),
4642 emit_extract_vector(ctx, vec, 1, s1),
4643 emit_extract_vector(ctx, vec, 2, s1));
4644 } else {
4645 ctx->block->instructions.emplace_back(std::move(load));
4646 }
4647 }
4648 }
4649
4650 void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
4651 {
4652 Builder bld(ctx->program, ctx->block);
4653 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4654
4655 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4656 Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4657
4658 unsigned writemask = nir_intrinsic_write_mask(instr);
4659 while (writemask) {
4660 int start, count;
4661 u_bit_scan_consecutive_range(&writemask, &start, &count);
4662 unsigned num_bytes = count * elem_size_bytes;
4663
4664 Temp write_data = data;
4665 if (count != instr->num_components) {
4666 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4667 for (int i = 0; i < count; i++)
4668 vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
4669 write_data = bld.tmp(RegType::vgpr, count);
4670 vec->definitions[0] = Definition(write_data);
4671 ctx->block->instructions.emplace_back(std::move(vec));
4672 }
4673
4674 unsigned offset = start * elem_size_bytes;
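/* FLAT before GFX9 has no immediate offset, so fold the offset into the
 * 64-bit address with an add-with-carry. */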
4675 if (offset > 0 && ctx->options->chip_class < GFX9) {
4676 Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
4677 Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
4678 Temp carry = bld.tmp(s2);
4679 bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
4680
4681 bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
4682 Operand(offset), addr0);
4683 bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(s2),
4684 Operand(0u), addr1,
4685 carry).def(1).setHint(vcc);
4686
4687 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
4688
4689 offset = 0;
4690 }
4691
4692 bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4693 bool global = ctx->options->chip_class >= GFX9;
4694 aco_opcode op;
4695 switch (num_bytes) {
4696 case 4:
4697 op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
4698 break;
4699 case 8:
4700 op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
4701 break;
4702 case 12:
4703 op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
4704 break;
4705 case 16:
4706 op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
4707 break;
4708 default:
4709 unreachable("store_global not implemented for this size.");
4710 }
4711 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
4712 flat->operands[0] = Operand(addr);
4713 flat->operands[1] = Operand(s1);
4714 flat->operands[2] = Operand(data);
4715 flat->glc = glc;
4716 flat->dlc = false;
4717 flat->offset = offset;
4718 ctx->block->instructions.emplace_back(std::move(flat));
4719 }
4720 }
4721
4722 void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
4723 Builder bld(ctx->program, ctx->block);
4724 switch(instr->intrinsic) {
4725 case nir_intrinsic_group_memory_barrier:
4726 case nir_intrinsic_memory_barrier:
4727 bld.barrier(aco_opcode::p_memory_barrier_all);
4728 break;
4729 case nir_intrinsic_memory_barrier_atomic_counter:
4730 bld.barrier(aco_opcode::p_memory_barrier_atomic);
4731 break;
4732 case nir_intrinsic_memory_barrier_buffer:
4733 bld.barrier(aco_opcode::p_memory_barrier_buffer);
4734 break;
4735 case nir_intrinsic_memory_barrier_image:
4736 bld.barrier(aco_opcode::p_memory_barrier_image);
4737 break;
4738 case nir_intrinsic_memory_barrier_shared:
4739 bld.barrier(aco_opcode::p_memory_barrier_shared);
4740 break;
4741 default:
4742 unreachable("Unimplemented memory barrier intrinsic");
4743 break;
4744 }
4745 }
4746
4747 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4748 {
4749 // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
4750 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4751 assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
4752 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4753 Builder bld(ctx->program, ctx->block);
4754
4755 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
4756 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
4757 load_lds(ctx, elem_size_bytes, dst, address, nir_intrinsic_base(instr), align);
4758 }
4759
4760 void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4761 {
4762 unsigned writemask = nir_intrinsic_write_mask(instr);
4763 Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4764 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4765 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4766 assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
4767
4768 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
4769 store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
4770 }
4771
4772 void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4773 {
4774 unsigned offset = nir_intrinsic_base(instr);
4775 Operand m = load_lds_size_m0(ctx);
4776 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4777 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4778
4779 unsigned num_operands = 3;
4780 aco_opcode op32, op64, op32_rtn, op64_rtn;
4781 switch(instr->intrinsic) {
4782 case nir_intrinsic_shared_atomic_add:
4783 op32 = aco_opcode::ds_add_u32;
4784 op64 = aco_opcode::ds_add_u64;
4785 op32_rtn = aco_opcode::ds_add_rtn_u32;
4786 op64_rtn = aco_opcode::ds_add_rtn_u64;
4787 break;
4788 case nir_intrinsic_shared_atomic_imin:
4789 op32 = aco_opcode::ds_min_i32;
4790 op64 = aco_opcode::ds_min_i64;
4791 op32_rtn = aco_opcode::ds_min_rtn_i32;
4792 op64_rtn = aco_opcode::ds_min_rtn_i64;
4793 break;
4794 case nir_intrinsic_shared_atomic_umin:
4795 op32 = aco_opcode::ds_min_u32;
4796 op64 = aco_opcode::ds_min_u64;
4797 op32_rtn = aco_opcode::ds_min_rtn_u32;
4798 op64_rtn = aco_opcode::ds_min_rtn_u64;
4799 break;
4800 case nir_intrinsic_shared_atomic_imax:
4801 op32 = aco_opcode::ds_max_i32;
4802 op64 = aco_opcode::ds_max_i64;
4803 op32_rtn = aco_opcode::ds_max_rtn_i32;
4804 op64_rtn = aco_opcode::ds_max_rtn_i64;
4805 break;
4806 case nir_intrinsic_shared_atomic_umax:
4807 op32 = aco_opcode::ds_max_u32;
4808 op64 = aco_opcode::ds_max_u64;
4809 op32_rtn = aco_opcode::ds_max_rtn_u32;
4810 op64_rtn = aco_opcode::ds_max_rtn_u64;
4811 break;
4812 case nir_intrinsic_shared_atomic_and:
4813 op32 = aco_opcode::ds_and_b32;
4814 op64 = aco_opcode::ds_and_b64;
4815 op32_rtn = aco_opcode::ds_and_rtn_b32;
4816 op64_rtn = aco_opcode::ds_and_rtn_b64;
4817 break;
4818 case nir_intrinsic_shared_atomic_or:
4819 op32 = aco_opcode::ds_or_b32;
4820 op64 = aco_opcode::ds_or_b64;
4821 op32_rtn = aco_opcode::ds_or_rtn_b32;
4822 op64_rtn = aco_opcode::ds_or_rtn_b64;
4823 break;
4824 case nir_intrinsic_shared_atomic_xor:
4825 op32 = aco_opcode::ds_xor_b32;
4826 op64 = aco_opcode::ds_xor_b64;
4827 op32_rtn = aco_opcode::ds_xor_rtn_b32;
4828 op64_rtn = aco_opcode::ds_xor_rtn_b64;
4829 break;
4830 case nir_intrinsic_shared_atomic_exchange:
4831 op32 = aco_opcode::ds_write_b32;
4832 op64 = aco_opcode::ds_write_b64;
4833 op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
4834 op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
4835 break;
4836 case nir_intrinsic_shared_atomic_comp_swap:
4837 op32 = aco_opcode::ds_cmpst_b32;
4838 op64 = aco_opcode::ds_cmpst_b64;
4839 op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
4840 op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
4841 num_operands = 4;
4842 break;
4843 default:
4844 unreachable("Unhandled shared atomic intrinsic");
4845 }
4846
4847 /* return the previous value if dest is ever used */
4848 bool return_previous = false;
4849 nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4850 return_previous = true;
4851 break;
4852 }
4853 nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4854 return_previous = true;
4855 break;
4856 }
4857
4858 aco_opcode op;
4859 if (data.size() == 1) {
4860 assert(instr->dest.ssa.bit_size == 32);
4861 op = return_previous ? op32_rtn : op32;
4862 } else {
4863 assert(instr->dest.ssa.bit_size == 64);
4864 op = return_previous ? op64_rtn : op64;
4865 }
4866
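/* The DS immediate offset field is only 16 bits wide, so add larger bases to
 * the address instead. */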
4867 if (offset > 65535) {
4868 Builder bld(ctx->program, ctx->block);
4869 address = bld.vadd32(bld.def(v1), Operand(offset), address);
4870 offset = 0;
4871 }
4872
4873 aco_ptr<DS_instruction> ds;
4874 ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
4875 ds->operands[0] = Operand(address);
4876 ds->operands[1] = Operand(data);
4877 if (num_operands == 4)
4878 ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
4879 ds->operands[num_operands - 1] = m;
4880 ds->offset0 = offset;
4881 if (return_previous)
4882 ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
4883 ctx->block->instructions.emplace_back(std::move(ds));
4884 }
4885
4886 Temp get_scratch_resource(isel_context *ctx)
4887 {
4888 Builder bld(ctx->program, ctx->block);
4889 Temp scratch_addr = ctx->private_segment_buffer;
4890 if (ctx->stage != compute_cs)
4891 scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), ctx->private_segment_buffer, Operand(0u));
4892
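/* Scratch uses a swizzled buffer resource: ADD_TID_ENABLE makes the hardware
 * add the lane id to the index and INDEX_STRIDE matches the wave size. */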
4893 uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
4894 S_008F0C_INDEX_STRIDE(ctx->options->wave_size == 64 ? 3 : 2);
4895
4896 if (ctx->program->chip_class >= GFX10) {
4897 rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
4898 S_008F0C_OOB_SELECT(3) |
4899 S_008F0C_RESOURCE_LEVEL(1);
4900 } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
4901 rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4902 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4903 }
4904
4905 /* older generations need element size = 16 bytes. element size removed in GFX9 */
4906 if (ctx->program->chip_class <= GFX8)
4907 rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
4908
4909 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
4910 }
4911
4912 void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
4913 assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64);
4914 Builder bld(ctx->program, ctx->block);
4915 Temp rsrc = get_scratch_resource(ctx);
4916 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4917 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4918
4919 aco_opcode op;
4920 switch (dst.size()) {
4921 case 1:
4922 op = aco_opcode::buffer_load_dword;
4923 break;
4924 case 2:
4925 op = aco_opcode::buffer_load_dwordx2;
4926 break;
4927 case 3:
4928 op = aco_opcode::buffer_load_dwordx3;
4929 break;
4930 case 4:
4931 op = aco_opcode::buffer_load_dwordx4;
4932 break;
4933 case 6:
4934 case 8: {
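/* A single MUBUF load returns at most four dwords, so issue two loads and
 * recombine the halves into the destination vector. */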
4935 std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
4936 Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
4937 bld.def(v4), offset, rsrc,
4938 ctx->scratch_offset, 0, true);
4939 Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
4940 aco_opcode::buffer_load_dwordx4,
4941 dst.size() == 6 ? bld.def(v2) : bld.def(v4),
4942 offset, rsrc, ctx->scratch_offset, 16, true);
4943 emit_split_vector(ctx, lower, 2);
4944 elems[0] = emit_extract_vector(ctx, lower, 0, v2);
4945 elems[1] = emit_extract_vector(ctx, lower, 1, v2);
4946 if (dst.size() == 8) {
4947 emit_split_vector(ctx, upper, 2);
4948 elems[2] = emit_extract_vector(ctx, upper, 0, v2);
4949 elems[3] = emit_extract_vector(ctx, upper, 1, v2);
4950 } else {
4951 elems[2] = upper;
4952 }
4953
4954 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4955 Format::PSEUDO, dst.size() / 2, 1)};
4956 for (unsigned i = 0; i < dst.size() / 2; i++)
4957 vec->operands[i] = Operand(elems[i]);
4958 vec->definitions[0] = Definition(dst);
4959 bld.insert(std::move(vec));
4960 ctx->allocated_vec.emplace(dst.id(), elems);
4961 return;
4962 }
4963 default:
4964 unreachable("Wrong dst size for nir_intrinsic_load_scratch");
4965 }
4966
4967 bld.mubuf(op, Definition(dst), offset, rsrc, ctx->scratch_offset, 0, true);
4968 emit_split_vector(ctx, dst, instr->num_components);
4969 }
4970
4971 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
4972 assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
4973 Builder bld(ctx->program, ctx->block);
4974 Temp rsrc = get_scratch_resource(ctx);
4975 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4976 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4977
4978 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4979 unsigned writemask = nir_intrinsic_write_mask(instr);
4980
4981 while (writemask) {
4982 int start, count;
4983 u_bit_scan_consecutive_range(&writemask, &start, &count);
4984 int num_bytes = count * elem_size_bytes;
4985
4986 if (num_bytes > 16) {
4987 assert(elem_size_bytes == 8);
4988 writemask |= (((count - 2) << 1) - 1) << (start + 2);
4989 count = 2;
4990 num_bytes = 16;
4991 }
4992
4993 // TODO: check alignment of sub-dword stores
4994 // TODO: split 3 bytes. there is no store instruction for that
4995
4996 Temp write_data;
4997 if (count != instr->num_components) {
4998 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4999 for (int i = 0; i < count; i++) {
5000 Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
5001 vec->operands[i] = Operand(elem);
5002 }
5003 write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
5004 vec->definitions[0] = Definition(write_data);
5005 ctx->block->instructions.emplace_back(std::move(vec));
5006 } else {
5007 write_data = data;
5008 }
5009
5010 aco_opcode op;
5011 switch (num_bytes) {
5012 case 4:
5013 op = aco_opcode::buffer_store_dword;
5014 break;
5015 case 8:
5016 op = aco_opcode::buffer_store_dwordx2;
5017 break;
5018 case 12:
5019 op = aco_opcode::buffer_store_dwordx3;
5020 break;
5021 case 16:
5022 op = aco_opcode::buffer_store_dwordx4;
5023 break;
5024 default:
5025 unreachable("Invalid data size for nir_intrinsic_store_scratch.");
5026 }
5027
5028 bld.mubuf(op, offset, rsrc, ctx->scratch_offset, write_data, start * elem_size_bytes, true);
5029 }
5030 }
5031
5032 void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
5033 uint8_t log2_ps_iter_samples;
5034 if (ctx->program->info->ps.force_persample) {
5035 log2_ps_iter_samples =
5036 util_logbase2(ctx->options->key.fs.num_samples);
5037 } else {
5038 log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
5039 }
5040
5041 /* The bit pattern matches that used by fixed function fragment
5042 * processing. */
5043 static const unsigned ps_iter_masks[] = {
5044 0xffff, /* not used */
5045 0x5555,
5046 0x1111,
5047 0x0101,
5048 0x0001,
5049 };
5050 assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
5051
5052 Builder bld(ctx->program, ctx->block);
5053
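/* gl_SampleMaskIn for this invocation: sample_coverage & (ps_iter_mask << sample_id) */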
5054 Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ctx->fs_inputs[fs_input::ancillary], Operand(8u), Operand(4u));
5055 Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
5056 Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
5057 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5058 bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, ctx->fs_inputs[fs_input::sample_coverage]);
5059 }
5060
5061 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
5062 {
5063 Builder bld(ctx->program, ctx->block);
5064
5065 if (cluster_size == 1) {
5066 return src;
5067 } else if (op == nir_op_iand && cluster_size == 4) {
5068 //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
5069 Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
5070 return bld.sop1(aco_opcode::s_not_b64, bld.def(s2), bld.def(s1, scc),
5071 bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), tmp));
5072 } else if (op == nir_op_ior && cluster_size == 4) {
5073 //subgroupClusteredOr(val, 4) -> wqm(val & exec)
5074 return bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc),
5075 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)));
5076 } else if (op == nir_op_iand && cluster_size == 64) {
5077 //subgroupAnd(val) -> (exec & ~val) == 0
5078 Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
5079 return bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), tmp, Operand(0u));
5080 } else if (op == nir_op_ior && cluster_size == 64) {
5081 //subgroupOr(val) -> (val & exec) != 0
5082 return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp();
5083 } else if (op == nir_op_ixor && cluster_size == 64) {
5084 //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
5085 Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5086 tmp = bld.sop1(aco_opcode::s_bcnt1_i32_b64, bld.def(s2), bld.def(s1, scc), tmp);
5087 return bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
5088 } else {
5089 //subgroupClustered{And,Or,Xor}(val, n) ->
5090 //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0))
5091 //cluster_offset = ~(n - 1) & lane_id
5092 //cluster_mask = ((1 << n) - 1)
5093 //subgroupClusteredAnd():
5094 // return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
5095 //subgroupClusteredOr():
5096 // return ((val & exec) >> cluster_offset) & cluster_mask != 0
5097 //subgroupClusteredXor():
5098 // return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
5099 Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5100 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5101 Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
5102
5103 Temp tmp;
5104 if (op == nir_op_iand)
5105 tmp = bld.sop2(aco_opcode::s_orn2_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5106 else
5107 tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5108
5109 uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
5110 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
5111 tmp = emit_extract_vector(ctx, tmp, 0, v1);
5112 if (cluster_mask != 0xffffffff)
5113 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
5114
5115 Definition cmp_def = Definition();
5116 if (op == nir_op_iand) {
5117 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(cluster_mask), tmp).def(0);
5118 } else if (op == nir_op_ior) {
5119 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5120 } else if (op == nir_op_ixor) {
5121 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
5122 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
5123 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5124 }
5125 cmp_def.setHint(vcc);
5126 return cmp_def.getTemp();
5127 }
5128 }
5129
5130 Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
5131 {
5132 Builder bld(ctx->program, ctx->block);
5133
5134 //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
5135 //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
5136 //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
5137 Temp tmp;
5138 if (op == nir_op_iand)
5139 tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
5140 else
5141 tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5142
5143 Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
5144 Temp lo = lohi.def(0).getTemp();
5145 Temp hi = lohi.def(1).getTemp();
5146 Temp mbcnt = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), hi,
5147 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), lo, Operand(0u)));
5148
5149 Definition cmp_def = Definition();
5150 if (op == nir_op_iand)
5151 cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5152 else if (op == nir_op_ior)
5153 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5154 else if (op == nir_op_ixor)
5155 cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u),
5156 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
5157 cmp_def.setHint(vcc);
5158 return cmp_def.getTemp();
5159 }
5160
5161 Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
5162 {
5163 Builder bld(ctx->program, ctx->block);
5164
5165 //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
5166 //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
5167 //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
5168 Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
5169 if (op == nir_op_iand)
5170 return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5171 else if (op == nir_op_ior)
5172 return bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5173 else if (op == nir_op_ixor)
5174 return bld.sop2(aco_opcode::s_xor_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5175
5176 assert(false);
5177 return Temp();
5178 }
5179
5180 void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
5181 {
5182 Builder bld(ctx->program, ctx->block);
5183 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
5184 if (src.regClass().type() == RegType::vgpr) {
5185 bld.pseudo(aco_opcode::p_as_uniform, dst, src);
5186 } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5187 bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(dst), Operand(0u), Operand(src));
5188 } else if (src.regClass() == s1) {
5189 bld.sop1(aco_opcode::s_mov_b32, dst, src);
5190 } else if (src.regClass() == s2) {
5191 bld.sop1(aco_opcode::s_mov_b64, dst, src);
5192 } else {
5193 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5194 nir_print_instr(&instr->instr, stderr);
5195 fprintf(stderr, "\n");
5196 }
5197 }
5198
5199 void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
5200 {
5201 Builder bld(ctx->program, ctx->block);
5202 Temp p1 = ctx->fs_inputs[fs_input::persp_center_p1];
5203 Temp p2 = ctx->fs_inputs[fs_input::persp_center_p2];
5204
5205 /* Build DD X/Y */
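/* quad_perm(0,0,0,0) broadcasts each quad's upper-left value; subtracting it
 * from the neighboring lanes yields the per-quad derivatives of I/J. */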
5206 Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_quad_perm(0, 0, 0, 0));
5207 Temp ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(1, 1, 1, 1));
5208 Temp ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(2, 2, 2, 2));
5209 Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_quad_perm(0, 0, 0, 0));
5210 Temp ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(1, 1, 1, 1));
5211 Temp ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(2, 2, 2, 2));
5212
5213 /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
5214 Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
5215 Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
5216 tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
5217 tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
5218 Temp wqm1 = bld.tmp(v1);
5219 emit_wqm(ctx, tmp1, wqm1, true);
5220 Temp wqm2 = bld.tmp(v1);
5221 emit_wqm(ctx, tmp2, wqm2, true);
5222 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
5223 return;
5224 }
5225
5226 void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
5227 {
5228 Builder bld(ctx->program, ctx->block);
5229 switch(instr->intrinsic) {
5230 case nir_intrinsic_load_barycentric_sample:
5231 case nir_intrinsic_load_barycentric_pixel:
5232 case nir_intrinsic_load_barycentric_centroid: {
5233 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
5234 fs_input input = get_interp_input(instr->intrinsic, mode);
5235
5236 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5237 if (input == fs_input::max_inputs) {
5238 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5239 Operand(0u), Operand(0u));
5240 } else {
5241 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5242 ctx->fs_inputs[input],
5243 ctx->fs_inputs[input + 1]);
5244 }
5245 emit_split_vector(ctx, dst, 2);
5246 break;
5247 }
5248 case nir_intrinsic_load_barycentric_at_sample: {
5249 uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
5250 switch (ctx->options->key.fs.num_samples) {
5251 case 2: sample_pos_offset += 1 << 3; break;
5252 case 4: sample_pos_offset += 3 << 3; break;
5253 case 8: sample_pos_offset += 7 << 3; break;
5254 default: break;
5255 }
5256 Temp sample_pos;
5257 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
5258 nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
5259 if (addr.type() == RegType::sgpr) {
5260 Operand offset;
5261 if (const_addr) {
5262 sample_pos_offset += const_addr->u32 << 3;
5263 offset = Operand(sample_pos_offset);
5264 } else if (ctx->options->chip_class >= GFX9) {
5265 offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
5266 } else {
5267 offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
5268 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(sample_pos_offset));
5269 }
5270 addr = ctx->private_segment_buffer;
5271 sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand(offset));
5272
5273 } else if (ctx->options->chip_class >= GFX9) {
5274 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5275 sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, ctx->private_segment_buffer, sample_pos_offset);
5276 } else {
5277 /* addr += ctx->private_segment_buffer + sample_pos_offset */
5278 Temp tmp0 = bld.tmp(s1);
5279 Temp tmp1 = bld.tmp(s1);
5280 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), ctx->private_segment_buffer);
5281 Definition scc_tmp = bld.def(s1, scc);
5282 tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
5283 tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), scc_tmp.getTemp());
5284 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5285 Temp pck0 = bld.tmp(v1);
5286 Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
5287 tmp1 = as_vgpr(ctx, tmp1);
5288 Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(s2)), tmp1, Operand(0u), carry);
5289 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
5290
5291 /* sample_pos = flat_load_dwordx2 addr */
5292 sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
5293 }
5294
5295 /* sample_pos -= 0.5 */
5296 Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
5297 Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
5298 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
5299 pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
5300 pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
5301
5302 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5303 break;
5304 }
5305 case nir_intrinsic_load_barycentric_at_offset: {
5306 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5307 RegClass rc = RegClass(offset.type(), 1);
5308 Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
5309 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
5310 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5311 break;
5312 }
5313 case nir_intrinsic_load_front_face: {
5314 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5315 Operand(0u), ctx->fs_inputs[fs_input::front_face]).def(0).setHint(vcc);
5316 break;
5317 }
5318 case nir_intrinsic_load_view_index:
5319 case nir_intrinsic_load_layer_id: {
5320 if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & sw_vs)) {
5321 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5322 bld.copy(Definition(dst), Operand(ctx->view_index));
5323 break;
5324 }
5325
5326 unsigned idx = nir_intrinsic_base(instr);
5327 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5328 Operand(2u), bld.m0(ctx->prim_mask), idx, 0);
5329 break;
5330 }
5331 case nir_intrinsic_load_frag_coord: {
5332 emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
5333 break;
5334 }
5335 case nir_intrinsic_load_sample_pos: {
5336 Temp posx = ctx->fs_inputs[fs_input::frag_pos_0];
5337 Temp posy = ctx->fs_inputs[fs_input::frag_pos_1];
5338 bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5339 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
5340 posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
5341 break;
5342 }
5343 case nir_intrinsic_load_interpolated_input:
5344 visit_load_interpolated_input(ctx, instr);
5345 break;
5346 case nir_intrinsic_store_output:
5347 visit_store_output(ctx, instr);
5348 break;
5349 case nir_intrinsic_load_input:
5350 visit_load_input(ctx, instr);
5351 break;
5352 case nir_intrinsic_load_ubo:
5353 visit_load_ubo(ctx, instr);
5354 break;
5355 case nir_intrinsic_load_push_constant:
5356 visit_load_push_constant(ctx, instr);
5357 break;
5358 case nir_intrinsic_load_constant:
5359 visit_load_constant(ctx, instr);
5360 break;
5361 case nir_intrinsic_vulkan_resource_index:
5362 visit_load_resource(ctx, instr);
5363 break;
5364 case nir_intrinsic_discard:
5365 visit_discard(ctx, instr);
5366 break;
5367 case nir_intrinsic_discard_if:
5368 visit_discard_if(ctx, instr);
5369 break;
5370 case nir_intrinsic_load_shared:
5371 visit_load_shared(ctx, instr);
5372 break;
5373 case nir_intrinsic_store_shared:
5374 visit_store_shared(ctx, instr);
5375 break;
5376 case nir_intrinsic_shared_atomic_add:
5377 case nir_intrinsic_shared_atomic_imin:
5378 case nir_intrinsic_shared_atomic_umin:
5379 case nir_intrinsic_shared_atomic_imax:
5380 case nir_intrinsic_shared_atomic_umax:
5381 case nir_intrinsic_shared_atomic_and:
5382 case nir_intrinsic_shared_atomic_or:
5383 case nir_intrinsic_shared_atomic_xor:
5384 case nir_intrinsic_shared_atomic_exchange:
5385 case nir_intrinsic_shared_atomic_comp_swap:
5386 visit_shared_atomic(ctx, instr);
5387 break;
5388 case nir_intrinsic_image_deref_load:
5389 visit_image_load(ctx, instr);
5390 break;
5391 case nir_intrinsic_image_deref_store:
5392 visit_image_store(ctx, instr);
5393 break;
5394 case nir_intrinsic_image_deref_atomic_add:
5395 case nir_intrinsic_image_deref_atomic_umin:
5396 case nir_intrinsic_image_deref_atomic_imin:
5397 case nir_intrinsic_image_deref_atomic_umax:
5398 case nir_intrinsic_image_deref_atomic_imax:
5399 case nir_intrinsic_image_deref_atomic_and:
5400 case nir_intrinsic_image_deref_atomic_or:
5401 case nir_intrinsic_image_deref_atomic_xor:
5402 case nir_intrinsic_image_deref_atomic_exchange:
5403 case nir_intrinsic_image_deref_atomic_comp_swap:
5404 visit_image_atomic(ctx, instr);
5405 break;
5406 case nir_intrinsic_image_deref_size:
5407 visit_image_size(ctx, instr);
5408 break;
5409 case nir_intrinsic_load_ssbo:
5410 visit_load_ssbo(ctx, instr);
5411 break;
5412 case nir_intrinsic_store_ssbo:
5413 visit_store_ssbo(ctx, instr);
5414 break;
5415 case nir_intrinsic_load_global:
5416 visit_load_global(ctx, instr);
5417 break;
5418 case nir_intrinsic_store_global:
5419 visit_store_global(ctx, instr);
5420 break;
5421 case nir_intrinsic_ssbo_atomic_add:
5422 case nir_intrinsic_ssbo_atomic_imin:
5423 case nir_intrinsic_ssbo_atomic_umin:
5424 case nir_intrinsic_ssbo_atomic_imax:
5425 case nir_intrinsic_ssbo_atomic_umax:
5426 case nir_intrinsic_ssbo_atomic_and:
5427 case nir_intrinsic_ssbo_atomic_or:
5428 case nir_intrinsic_ssbo_atomic_xor:
5429 case nir_intrinsic_ssbo_atomic_exchange:
5430 case nir_intrinsic_ssbo_atomic_comp_swap:
5431 visit_atomic_ssbo(ctx, instr);
5432 break;
5433 case nir_intrinsic_load_scratch:
5434 visit_load_scratch(ctx, instr);
5435 break;
5436 case nir_intrinsic_store_scratch:
5437 visit_store_scratch(ctx, instr);
5438 break;
5439 case nir_intrinsic_get_buffer_size:
5440 visit_get_buffer_size(ctx, instr);
5441 break;
5442 case nir_intrinsic_barrier: {
5443 unsigned* bsize = ctx->program->info->cs.block_size;
5444 unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
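/* Workgroups of at most 64 invocations fit in a single (wave64) wave and run
 * in lock-step, so the barrier can be skipped. */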
5445 if (workgroup_size > 64)
5446 bld.sopp(aco_opcode::s_barrier);
5447 break;
5448 }
5449 case nir_intrinsic_group_memory_barrier:
5450 case nir_intrinsic_memory_barrier:
5451 case nir_intrinsic_memory_barrier_atomic_counter:
5452 case nir_intrinsic_memory_barrier_buffer:
5453 case nir_intrinsic_memory_barrier_image:
5454 case nir_intrinsic_memory_barrier_shared:
5455 emit_memory_barrier(ctx, instr);
5456 break;
5457 case nir_intrinsic_load_num_work_groups:
5458 case nir_intrinsic_load_work_group_id:
5459 case nir_intrinsic_load_local_invocation_id: {
5460 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5461 Temp* ids;
5462 if (instr->intrinsic == nir_intrinsic_load_num_work_groups)
5463 ids = ctx->num_workgroups;
5464 else if (instr->intrinsic == nir_intrinsic_load_work_group_id)
5465 ids = ctx->workgroup_ids;
5466 else
5467 ids = ctx->local_invocation_ids;
5468 bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5469 ids[0].id() ? Operand(ids[0]) : Operand(1u),
5470 ids[1].id() ? Operand(ids[1]) : Operand(1u),
5471 ids[2].id() ? Operand(ids[2]) : Operand(1u));
5472 emit_split_vector(ctx, dst, 3);
5473 break;
5474 }
5475 case nir_intrinsic_load_local_invocation_index: {
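/* index = wave_id_in_workgroup * 64 + lane_id; tg_size bits [11:6] already
 * hold the wave id shifted into place. */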
5476 Temp id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5477 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5478 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
5479 bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
5480 break;
5481 }
5482 case nir_intrinsic_load_subgroup_id: {
5483 if (ctx->stage == compute_cs) {
5484 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
5485 bld.sop2(aco_opcode::s_lshr_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), tg_num, Operand(0x6u));
5486 } else {
5487 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
5488 }
5489 break;
5490 }
5491 case nir_intrinsic_load_subgroup_invocation: {
5492 bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand((uint32_t) -1),
5493 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5494 break;
5495 }
5496 case nir_intrinsic_load_num_subgroups: {
5497 if (ctx->stage == compute_cs)
5498 bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu), ctx->tg_size);
5499 else
5500 bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
5501 break;
5502 }
5503 case nir_intrinsic_ballot: {
5504 Definition tmp = bld.def(s2);
5505 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5506 if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s2) {
5507 bld.sop2(aco_opcode::s_and_b64, tmp, bld.def(s1, scc), Operand(exec, s2), src);
5508 } else if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s1) {
5509 bld.sop2(aco_opcode::s_cselect_b64, tmp, Operand(exec, s2), Operand(0u), bld.scc(src));
5510 } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
5511 bld.vopc(aco_opcode::v_cmp_lg_u32, tmp, Operand(0u), src);
5512 } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
5513 bld.vopc(aco_opcode::v_cmp_lg_u64, tmp, Operand(0u), src);
5514 } else {
5515 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5516 nir_print_instr(&instr->instr, stderr);
5517 fprintf(stderr, "\n");
5518 }
5519 emit_wqm(ctx, tmp.getTemp(), get_ssa_temp(ctx, &instr->dest.ssa));
5520 break;
5521 }
5522 case nir_intrinsic_shuffle: {
5523 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5524 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5525 emit_uniform_subgroup(ctx, instr, src);
5526 } else {
5527 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
5528 assert(tid.regClass() == v1);
5529 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5530 if (src.regClass() == v1) {
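/* ds_bpermute_b32 addresses lanes in bytes, so shift the lane index left by 2. */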
5531 tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
5532 emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, src), dst);
5533 } else if (src.regClass() == v2) {
5534 tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
5535
5536 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5537 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5538 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, lo));
5539 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, hi));
5540 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5541 emit_split_vector(ctx, dst, 2);
5542 } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5543 Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
5544 tmp = emit_extract_vector(ctx, tmp, 0, v1);
5545 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
5546 emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp), dst);
5547 } else {
5548 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5549 nir_print_instr(&instr->instr, stderr);
5550 fprintf(stderr, "\n");
5551 }
5552 }
5553 break;
5554 }
5555 case nir_intrinsic_load_sample_id: {
5556 bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5557 ctx->fs_inputs[ancillary], Operand(8u), Operand(4u));
5558 break;
5559 }
5560 case nir_intrinsic_load_sample_mask_in: {
5561 visit_load_sample_mask_in(ctx, instr);
5562 break;
5563 }
5564 case nir_intrinsic_read_first_invocation: {
5565 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5566 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5567 if (src.regClass() == v1) {
5568 emit_wqm(ctx,
5569 bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
5570 dst);
5571 } else if (src.regClass() == v2) {
5572 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5573 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5574 lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
5575 hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
5576 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5577 emit_split_vector(ctx, dst, 2);
5578 } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5579 emit_wqm(ctx,
5580 bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src,
5581 bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2))),
5582 dst);
5583 } else if (src.regClass() == s1) {
5584 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
5585 } else if (src.regClass() == s2) {
5586 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
5587 } else {
5588 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5589 nir_print_instr(&instr->instr, stderr);
5590 fprintf(stderr, "\n");
5591 }
5592 break;
5593 }
5594 case nir_intrinsic_read_invocation: {
5595 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5596 Temp lane = get_ssa_temp(ctx, instr->src[1].ssa);
5597 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5598 assert(lane.regClass() == s1);
5599 if (src.regClass() == v1) {
5600 emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), src, lane), dst);
5601 } else if (src.regClass() == v2) {
5602 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5603 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5604 lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), lo, lane));
5605 hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), hi, lane));
5606 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5607 emit_split_vector(ctx, dst, 2);
5608 } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5609 emit_wqm(ctx, bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, lane), dst);
5610 } else if (src.regClass() == s1) {
5611 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
5612 } else if (src.regClass() == s2) {
5613 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
5614 } else {
5615 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5616 nir_print_instr(&instr->instr, stderr);
5617 fprintf(stderr, "\n");
5618 }
5619 break;
5620 }
5621 case nir_intrinsic_vote_all: {
5622 Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
5623 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5624 assert(src.regClass() == s2);
5625 assert(dst.regClass() == s1);
5626
5627 Definition tmp = bld.def(s1);
5628 bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(tmp),
5629 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)),
5630 Operand(exec, s2));
5631 emit_wqm(ctx, tmp.getTemp(), dst);
5632 break;
5633 }
5634 case nir_intrinsic_vote_any: {
5635 Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
5636 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5637 assert(src.regClass() == s2);
5638 assert(dst.regClass() == s1);
5639
5640 Definition tmp = bld.def(s1);
5641 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(tmp), src, Operand(exec, s2));
5642 emit_wqm(ctx, tmp.getTemp(), dst);
5643 break;
5644 }
5645 case nir_intrinsic_reduce:
5646 case nir_intrinsic_inclusive_scan:
5647 case nir_intrinsic_exclusive_scan: {
5648 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5649 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5650 nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
5651 unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
5652 nir_intrinsic_cluster_size(instr) : 0;
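/* cluster_size == 0 means the whole subgroup; clamp to the wave size (64) and round up to a power of two */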
5653 cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : 64, 64));
5654
5655 if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
5656 emit_uniform_subgroup(ctx, instr, src);
5657 } else if (instr->dest.ssa.bit_size == 1) {
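/* for 1-bit values these reductions reduce to bitwise ops: multiply/min act as AND, add (mod 2) as XOR, max as OR */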
5658 if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
5659 op = nir_op_iand;
5660 else if (op == nir_op_iadd)
5661 op = nir_op_ixor;
5662 else if (op == nir_op_umax || op == nir_op_imax)
5663 op = nir_op_ior;
5664 assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
5665
5666 switch (instr->intrinsic) {
5667 case nir_intrinsic_reduce:
5668 emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
5669 break;
5670 case nir_intrinsic_exclusive_scan:
5671 emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
5672 break;
5673 case nir_intrinsic_inclusive_scan:
5674 emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
5675 break;
5676 default:
5677 assert(false);
5678 }
5679 } else if (cluster_size == 1) {
5680 bld.copy(Definition(dst), src);
5681 } else {
5682 src = as_vgpr(ctx, src);
5683
5684 ReduceOp reduce_op;
5685 switch (op) {
5686 #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
5687 CASE(iadd)
5688 CASE(imul)
5689 CASE(fadd)
5690 CASE(fmul)
5691 CASE(imin)
5692 CASE(umin)
5693 CASE(fmin)
5694 CASE(imax)
5695 CASE(umax)
5696 CASE(fmax)
5697 CASE(iand)
5698 CASE(ior)
5699 CASE(ixor)
5700 default:
5701 unreachable("unknown reduction op");
5702 #undef CASE
5703 }
5704
5705 aco_opcode aco_op;
5706 switch (instr->intrinsic) {
5707 case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
5708 case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
5709 case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
5710 default:
5711 unreachable("unknown reduce intrinsic");
5712 }
5713
5714 aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
5715 reduce->operands[0] = Operand(src);
5716 // filled in by aco_reduce_assign.cpp, used internally as part of the
5717 // reduce sequence
5718 assert(dst.size() == 1 || dst.size() == 2);
5719 reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
5720 reduce->operands[2] = Operand(v1.as_linear());
5721
5722 Temp tmp_dst = bld.tmp(dst.regClass());
5723 reduce->definitions[0] = Definition(tmp_dst);
5724 reduce->definitions[1] = bld.def(s2); // used internally
5725 reduce->definitions[2] = Definition();
5726 reduce->definitions[3] = Definition(scc, s1);
5727 reduce->definitions[4] = Definition();
5728 reduce->reduce_op = reduce_op;
5729 reduce->cluster_size = cluster_size;
5730 ctx->block->instructions.emplace_back(std::move(reduce));
5731
5732 emit_wqm(ctx, tmp_dst, dst);
5733 }
5734 break;
5735 }
5736 case nir_intrinsic_quad_broadcast: {
5737 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5738 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5739 emit_uniform_subgroup(ctx, instr, src);
5740 } else {
5741 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5742 unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
5743 if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
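/* select the chosen lane's bit in every quad, AND with the active source bits, then let s_wqm broadcast each quad's result to all four lanes */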
5744 uint32_t half_mask = 0x11111111u << lane;
5745 Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
5746 Temp tmp = bld.tmp(s2);
5747 bld.sop1(aco_opcode::s_wqm_b64, Definition(tmp),
5748 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), mask_tmp,
5749 bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2))));
5750 emit_wqm(ctx, tmp, dst);
5751 } else if (instr->dest.ssa.bit_size == 32) {
5752 emit_wqm(ctx,
5753 bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src,
5754 dpp_quad_perm(lane, lane, lane, lane)),
5755 dst);
5756 } else if (instr->dest.ssa.bit_size == 64) {
5757 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5758 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5759 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_quad_perm(lane, lane, lane, lane)));
5760 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_quad_perm(lane, lane, lane, lane)));
5761 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5762 emit_split_vector(ctx, dst, 2);
5763 } else {
5764 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5765 nir_print_instr(&instr->instr, stderr);
5766 fprintf(stderr, "\n");
5767 }
5768 }
5769 break;
5770 }
5771 case nir_intrinsic_quad_swap_horizontal:
5772 case nir_intrinsic_quad_swap_vertical:
5773 case nir_intrinsic_quad_swap_diagonal:
5774 case nir_intrinsic_quad_swizzle_amd: {
5775 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5776 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5777 emit_uniform_subgroup(ctx, instr, src);
5778 break;
5779 }
5780 uint16_t dpp_ctrl = 0;
5781 switch (instr->intrinsic) {
5782 case nir_intrinsic_quad_swap_horizontal:
5783 dpp_ctrl = dpp_quad_perm(1, 0, 3, 2);
5784 break;
5785 case nir_intrinsic_quad_swap_vertical:
5786 dpp_ctrl = dpp_quad_perm(2, 3, 0, 1);
5787 break;
5788 case nir_intrinsic_quad_swap_diagonal:
5789 dpp_ctrl = dpp_quad_perm(3, 2, 1, 0);
5790 break;
5791 case nir_intrinsic_quad_swizzle_amd: {
5792 dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
5793 break;
5794 }
5795 default:
5796 break;
5797 }
5798
5799 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5800 if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
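/* materialize the mask as 0/-1 per lane, permute within the quad via DPP, then compare back into a mask */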
5801 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
5802 src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5803 Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), src);
5804 emit_wqm(ctx, tmp, dst);
5805 } else if (instr->dest.ssa.bit_size == 32) {
5806 Temp tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5807 emit_wqm(ctx, tmp, dst);
5808 } else if (instr->dest.ssa.bit_size == 64) {
5809 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5810 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5811 lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
5812 hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
5813 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5814 emit_split_vector(ctx, dst, 2);
5815 } else {
5816 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5817 nir_print_instr(&instr->instr, stderr);
5818 fprintf(stderr, "\n");
5819 }
5820 break;
5821 }
5822 case nir_intrinsic_masked_swizzle_amd: {
5823 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5824 if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5825 emit_uniform_subgroup(ctx, instr, src);
5826 break;
5827 }
5828 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5829 uint32_t mask = nir_intrinsic_swizzle_mask(instr);
5830 if (dst.regClass() == v1) {
5831 emit_wqm(ctx,
5832 bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
5833 dst);
5834 } else if (dst.regClass() == v2) {
5835 Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5836 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5837 lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
5838 hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
5839 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5840 emit_split_vector(ctx, dst, 2);
5841 } else {
5842 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5843 nir_print_instr(&instr->instr, stderr);
5844 fprintf(stderr, "\n");
5845 }
5846 break;
5847 }
5848 case nir_intrinsic_write_invocation_amd: {
5849 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5850 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
5851 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
5852 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5853 if (dst.regClass() == v1) {
5854 /* the hardware ignores src2 for writelane; it exists so RA assigns dst the same register and the other lanes keep their old value */
5855 emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val, lane, src), dst);
5856 } else if (dst.regClass() == v2) {
5857 Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
5858 Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
5859 bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
5860 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
5861 Temp lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_lo, lane, src_lo));
5862 Temp hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_hi, lane, src_hi));
5863 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5864 emit_split_vector(ctx, dst, 2);
5865 } else {
5866 fprintf(stderr, "Unimplemented NIR instr bit size: ");
5867 nir_print_instr(&instr->instr, stderr);
5868 fprintf(stderr, "\n");
5869 }
5870 break;
5871 }
5872 case nir_intrinsic_mbcnt_amd: {
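/* mbcnt yields, per lane, the number of bits of the 64-bit mask set in lower lanes (lo counts bits 0-31, hi adds bits 32-63) */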
5873 Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5874 RegClass rc = RegClass(src.type(), 1);
5875 Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
5876 bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
5877 Temp tmp = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, Operand(0u));
5878 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5879 Temp wqm_tmp = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), mask_hi, tmp);
5880 emit_wqm(ctx, wqm_tmp, dst);
5881 break;
5882 }
5883 case nir_intrinsic_load_helper_invocation: {
5884 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5885 bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
5886 ctx->block->kind |= block_kind_needs_lowering;
5887 ctx->program->needs_exact = true;
5888 break;
5889 }
5890 case nir_intrinsic_is_helper_invocation: {
5891 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5892 bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
5893 ctx->block->kind |= block_kind_needs_lowering;
5894 ctx->program->needs_exact = true;
5895 break;
5896 }
5897 case nir_intrinsic_demote:
5898 bld.pseudo(aco_opcode::p_demote_to_helper);
5899 ctx->block->kind |= block_kind_uses_demote;
5900 ctx->program->needs_exact = true;
5901 break;
5902 case nir_intrinsic_demote_if: {
5903 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
5904 as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false),
5905 Operand(exec, s2));
5906 bld.pseudo(aco_opcode::p_demote_to_helper, cond);
5907 ctx->block->kind |= block_kind_uses_demote;
5908 ctx->program->needs_exact = true;
5909 break;
5910 }
5911 case nir_intrinsic_first_invocation: {
5912 emit_wqm(ctx, bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)),
5913 get_ssa_temp(ctx, &instr->dest.ssa));
5914 break;
5915 }
5916 case nir_intrinsic_shader_clock:
5917 bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
5918 emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
5919 break;
5920 case nir_intrinsic_load_vertex_id_zero_base: {
5921 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5922 bld.copy(Definition(dst), ctx->vertex_id);
5923 break;
5924 }
5925 case nir_intrinsic_load_first_vertex: {
5926 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5927 bld.copy(Definition(dst), ctx->base_vertex);
5928 break;
5929 }
5930 case nir_intrinsic_load_base_instance: {
5931 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5932 bld.copy(Definition(dst), ctx->start_instance);
5933 break;
5934 }
5935 case nir_intrinsic_load_instance_id: {
5936 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5937 bld.copy(Definition(dst), ctx->instance_id);
5938 break;
5939 }
5940 case nir_intrinsic_load_draw_id: {
5941 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5942 bld.copy(Definition(dst), ctx->draw_id);
5943 break;
5944 }
5945 default:
5946 fprintf(stderr, "Unimplemented intrinsic instr: ");
5947 nir_print_instr(&instr->instr, stderr);
5948 fprintf(stderr, "\n");
5949 abort();
5950
5951 break;
5952 }
5953 }
5954
5955
5956 void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr,
5957 Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr,
5958 enum glsl_base_type *stype)
5959 {
5960 nir_deref_instr *texture_deref_instr = NULL;
5961 nir_deref_instr *sampler_deref_instr = NULL;
5962 int plane = -1;
5963
5964 for (unsigned i = 0; i < instr->num_srcs; i++) {
5965 switch (instr->src[i].src_type) {
5966 case nir_tex_src_texture_deref:
5967 texture_deref_instr = nir_src_as_deref(instr->src[i].src);
5968 break;
5969 case nir_tex_src_sampler_deref:
5970 sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
5971 break;
5972 case nir_tex_src_plane:
5973 plane = nir_src_as_int(instr->src[i].src);
5974 break;
5975 default:
5976 break;
5977 }
5978 }
5979
5980 *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
5981
5982 if (!sampler_deref_instr)
5983 sampler_deref_instr = texture_deref_instr;
5984
5985 if (plane >= 0) {
5986 assert(instr->op != nir_texop_txf_ms &&
5987 instr->op != nir_texop_samples_identical);
5988 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF);
5989 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
5990 } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
5991 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
5992 } else {
5993 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
5994 }
5995 if (samp_ptr) {
5996 *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false);
5997 if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
5998 fprintf(stderr, "Unimplemented sampler descriptor: ");
5999 nir_print_instr(&instr->instr, stderr);
6000 fprintf(stderr, "\n");
6001 abort();
6002 // TODO: build samp_ptr = and(samp_ptr, res_ptr)
6003 }
6004 }
6005 if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
6006 instr->op == nir_texop_samples_identical))
6007 *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
6008 }
6009
6010 void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
6011 Temp *out_ma, Temp *out_sc, Temp *out_tc)
6012 {
6013 Builder bld(ctx->program, ctx->block);
6014
6015 Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
6016 Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
6017 Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
6018
6019 Operand neg_one(0xbf800000u);
6020 Operand one(0x3f800000u);
6021 Operand two(0x40000000u);
6022 Operand four(0x40800000u);
6023
6024 Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), ma);
6025 Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
6026 Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
6027
6028 Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), four, id);
6029 Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id);
6030 is_ma_y = bld.sop2(aco_opcode::s_andn2_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), is_ma_y, is_ma_z);
6031 Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), is_ma_z, is_ma_y);
6032
6033 // select sc
6034 Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
6035 Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1),
6036 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z),
6037 one, is_ma_y);
6038 *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
6039
6040 // select tc
6041 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
6042 sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
6043 *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
6044
6045 // select ma
6046 tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6047 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
6048 deriv_z, is_ma_z);
6049 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp);
6050 *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
6051 }
6052
6053 void prepare_cube_coords(isel_context *ctx, Temp* coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
6054 {
6055 Builder bld(ctx->program, ctx->block);
6056 Temp coord_args[4], ma, tc, sc, id;
6057 for (unsigned i = 0; i < (is_array ? 4 : 3); i++)
6058 coord_args[i] = emit_extract_vector(ctx, *coords, i, v1);
6059
6060 if (is_array) {
6061 coord_args[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_args[3]);
6062
6063 // see comment in ac_prepare_cube_coords()
6064 if (ctx->options->chip_class <= GFX8)
6065 coord_args[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coord_args[3]);
6066 }
6067
6068 ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6069
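/* invma = 1 / |ma|: build v_rcp_f32 as VOP3 manually so the abs modifier can be set on the operand */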
6070 aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
6071 vop3a->operands[0] = Operand(ma);
6072 vop3a->abs[0] = true;
6073 Temp invma = bld.tmp(v1);
6074 vop3a->definitions[0] = Definition(invma);
6075 ctx->block->instructions.emplace_back(std::move(vop3a));
6076
6077 sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6078 if (!is_deriv)
6079 sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
6080
6081 tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6082 if (!is_deriv)
6083 tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
6084
6085 id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6086
6087 if (is_deriv) {
6088 sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
6089 tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
6090
6091 for (unsigned i = 0; i < 2; i++) {
6092 // see comment in ac_prepare_cube_coords()
6093 Temp deriv_ma;
6094 Temp deriv_sc, deriv_tc;
6095 build_cube_select(ctx, ma, id, i ? *ddy : *ddx,
6096 &deriv_ma, &deriv_sc, &deriv_tc);
6097
6098 deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
6099
6100 Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6101 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
6102 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
6103 Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6104 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
6105 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
6106 *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
6107 }
6108
6109 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc);
6110 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc);
6111 }
6112
6113 if (is_array)
6114 id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coord_args[3], id, Operand(0x41000000u/*8.0*/));
6115 *coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), sc, tc, id);
6116
6117 }
6118
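/* Rounds the given coordinate component (the array slice) to the nearest integer before it is used for layer selection. */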
6119 Temp apply_round_slice(isel_context *ctx, Temp coords, unsigned idx)
6120 {
6121 Temp coord_vec[3];
6122 for (unsigned i = 0; i < coords.size(); i++)
6123 coord_vec[i] = emit_extract_vector(ctx, coords, i, v1);
6124
6125 Builder bld(ctx->program, ctx->block);
6126 coord_vec[idx] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_vec[idx]);
6127
6128 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6129 for (unsigned i = 0; i < coords.size(); i++)
6130 vec->operands[i] = Operand(coord_vec[i]);
6131 Temp res = bld.tmp(RegType::vgpr, coords.size());
6132 vec->definitions[0] = Definition(res);
6133 ctx->block->instructions.emplace_back(std::move(vec));
6134 return res;
6135 }
6136
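/* If vec comes from a nir vecN instruction, collect each component's constant value (NULL for components that are not unswizzled constants). */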
6137 void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
6138 {
6139 if (vec->parent_instr->type != nir_instr_type_alu)
6140 return;
6141 nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr);
6142 if (vec_instr->op != nir_op_vec(vec->num_components))
6143 return;
6144
6145 for (unsigned i = 0; i < vec->num_components; i++) {
6146 cv[i] = vec_instr->src[i].swizzle[0] == 0 ?
6147 nir_src_as_const_value(vec_instr->src[i].src) : NULL;
6148 }
6149 }
6150
6151 void visit_tex(isel_context *ctx, nir_tex_instr *instr)
6152 {
6153 Builder bld(ctx->program, ctx->block);
6154 bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
6155 has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
6156 Temp resource, sampler, fmask_ptr, bias = Temp(), coords, compare = Temp(), sample_index = Temp(),
6157 lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), derivs = Temp();
6158 nir_const_value *sample_index_cv = NULL;
6159 nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
6160 enum glsl_base_type stype;
6161 tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
6162
6163 bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
6164 (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
6165 bool tg4_integer_cube_workaround = tg4_integer_workarounds &&
6166 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
6167
6168 for (unsigned i = 0; i < instr->num_srcs; i++) {
6169 switch (instr->src[i].src_type) {
6170 case nir_tex_src_coord:
6171 coords = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[i].src.ssa));
6172 break;
6173 case nir_tex_src_bias:
6174 if (instr->op == nir_texop_txb) {
6175 bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
6176 has_bias = true;
6177 }
6178 break;
6179 case nir_tex_src_lod: {
6180 nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
6181
6182 if (val && val->f32 <= 0.0) {
6183 level_zero = true;
6184 } else {
6185 lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
6186 has_lod = true;
6187 }
6188 break;
6189 }
6190 case nir_tex_src_comparator:
6191 if (instr->is_shadow) {
6192 compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
6193 has_compare = true;
6194 }
6195 break;
6196 case nir_tex_src_offset:
6197 offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
6198 get_const_vec(instr->src[i].src.ssa, const_offset);
6199 has_offset = true;
6200 break;
6201 case nir_tex_src_ddx:
6202 ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
6203 has_ddx = true;
6204 break;
6205 case nir_tex_src_ddy:
6206 ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
6207 has_ddy = true;
6208 break;
6209 case nir_tex_src_ms_index:
6210 sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
6211 sample_index_cv = nir_src_as_const_value(instr->src[i].src);
6212 has_sample_index = true;
6213 break;
6214 case nir_tex_src_texture_offset:
6215 case nir_tex_src_sampler_offset:
6216 default:
6217 break;
6218 }
6219 }
6220 // TODO: all other cases: structure taken from ac_nir_to_llvm.c
6221 if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
6222 return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
6223
6224 if (instr->op == nir_texop_texture_samples) {
6225 Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
6226
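/* resource dword3 (SI image descriptor layout): log2(samples) in bits [19:16], resource type in bits [31:28]; types >= 14 are the MSAA types */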
6227 Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
6228 Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
6229 Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
6230 Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
6231
6232 bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
6233 samples, Operand(1u), bld.scc(is_msaa));
6234 return;
6235 }
6236
6237 if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
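/* pack the immediate texel offsets into one dword as 6-bit fields, one per byte (x in [5:0], y in [13:8], z in [21:16]), as the *_o sample opcodes expect */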
6238 aco_ptr<Instruction> tmp_instr;
6239 Temp acc, pack = Temp();
6240
6241 uint32_t pack_const = 0;
6242 for (unsigned i = 0; i < offset.size(); i++) {
6243 if (!const_offset[i])
6244 continue;
6245 pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
6246 }
6247
6248 if (offset.type() == RegType::sgpr) {
6249 for (unsigned i = 0; i < offset.size(); i++) {
6250 if (const_offset[i])
6251 continue;
6252
6253 acc = emit_extract_vector(ctx, offset, i, s1);
6254 acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
6255
6256 if (i) {
6257 acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
6258 }
6259
6260 if (pack == Temp()) {
6261 pack = acc;
6262 } else {
6263 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
6264 }
6265 }
6266
6267 if (pack_const && pack != Temp())
6268 pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
6269 } else {
6270 for (unsigned i = 0; i < offset.size(); i++) {
6271 if (const_offset[i])
6272 continue;
6273
6274 acc = emit_extract_vector(ctx, offset, i, v1);
6275 acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
6276
6277 if (i) {
6278 acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
6279 }
6280
6281 if (pack == Temp()) {
6282 pack = acc;
6283 } else {
6284 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
6285 }
6286 }
6287
6288 if (pack_const && pack != Temp())
6289 pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
6290 }
6291 if (pack_const && pack == Temp())
6292 offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
6293 else if (pack == Temp())
6294 has_offset = false;
6295 else
6296 offset = pack;
6297 }
6298
6299 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
6300 prepare_cube_coords(ctx, &coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
6301
6302 /* pack derivatives */
6303 if (has_ddx || has_ddy) {
6304 if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
6305 derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4),
6306 ddx, Operand(0u), ddy, Operand(0u));
6307 } else {
6308 derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, ddx.size() + ddy.size()), ddx, ddy);
6309 }
6310 has_derivs = true;
6311 }
6312
6313 if (instr->coord_components > 1 &&
6314 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6315 instr->is_array &&
6316 instr->op != nir_texop_txf)
6317 coords = apply_round_slice(ctx, coords, 1);
6318
6319 if (instr->coord_components > 2 &&
6320 (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
6321 instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6322 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
6323 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6324 instr->is_array &&
6325 instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms)
6326 coords = apply_round_slice(ctx, coords, 2);
6327
6328 if (ctx->options->chip_class == GFX9 &&
6329 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6330 instr->op != nir_texop_lod && instr->coord_components) {
6331 assert(coords.size() > 0 && coords.size() < 3);
6332
6333 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size() + 1, 1)};
6334 vec->operands[0] = Operand(emit_extract_vector(ctx, coords, 0, v1));
6335 vec->operands[1] = instr->op == nir_texop_txf ? Operand((uint32_t) 0) : Operand((uint32_t) 0x3f000000);
6336 if (coords.size() > 1)
6337 vec->operands[2] = Operand(emit_extract_vector(ctx, coords, 1, v1));
6338 coords = bld.tmp(RegType::vgpr, coords.size() + 1);
6339 vec->definitions[0] = Definition(coords);
6340 ctx->block->instructions.emplace_back(std::move(vec));
6341 }
6342
6343 bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
6344
6345 if (instr->op == nir_texop_samples_identical)
6346 resource = fmask_ptr;
6347
6348 else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6349 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6350 instr->op != nir_texop_txs) {
6351 assert(has_sample_index);
6352 Operand op(sample_index);
6353 if (sample_index_cv)
6354 op = Operand(sample_index_cv->u32);
6355 sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
6356 }
6357
6358 if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
6359 Temp split_coords[coords.size()];
6360 emit_split_vector(ctx, coords, coords.size());
6361 for (unsigned i = 0; i < coords.size(); i++)
6362 split_coords[i] = emit_extract_vector(ctx, coords, i, v1);
6363
6364 unsigned i = 0;
6365 for (; i < std::min(offset.size(), instr->coord_components); i++) {
6366 Temp off = emit_extract_vector(ctx, offset, i, v1);
6367 split_coords[i] = bld.vadd32(bld.def(v1), split_coords[i], off);
6368 }
6369
6370 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6371 for (unsigned i = 0; i < coords.size(); i++)
6372 vec->operands[i] = Operand(split_coords[i]);
6373 coords = bld.tmp(coords.regClass());
6374 vec->definitions[0] = Definition(coords);
6375 ctx->block->instructions.emplace_back(std::move(vec));
6376
6377 has_offset = false;
6378 }
6379
6380 /* Build tex instruction */
6381 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
6382 unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
6383 ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
6384 : 0;
6385 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6386 Temp tmp_dst = dst;
6387
6388 /* gather4 selects the component by dmask and always returns vec4 */
6389 if (instr->op == nir_texop_tg4) {
6390 assert(instr->dest.ssa.num_components == 4);
6391 if (instr->is_shadow)
6392 dmask = 1;
6393 else
6394 dmask = 1 << instr->component;
6395 if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
6396 tmp_dst = bld.tmp(v4);
6397 } else if (instr->op == nir_texop_samples_identical) {
6398 tmp_dst = bld.tmp(v1);
6399 } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) {
6400 tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
6401 }
6402
6403 aco_ptr<MIMG_instruction> tex;
6404 if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
6405 if (!has_lod)
6406 lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6407
6408 bool div_by_6 = instr->op == nir_texop_txs &&
6409 instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
6410 instr->is_array &&
6411 (dmask & (1 << 2));
6412 if (tmp_dst.id() == dst.id() && div_by_6)
6413 tmp_dst = bld.tmp(tmp_dst.regClass());
6414
6415 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6416 tex->operands[0] = Operand(as_vgpr(ctx,lod));
6417 tex->operands[1] = Operand(resource);
6418 if (ctx->options->chip_class == GFX9 &&
6419 instr->op == nir_texop_txs &&
6420 instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6421 instr->is_array) {
6422 tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
6423 } else if (instr->op == nir_texop_query_levels) {
6424 tex->dmask = 1 << 3;
6425 } else {
6426 tex->dmask = dmask;
6427 }
6428 tex->da = da;
6429 tex->definitions[0] = Definition(tmp_dst);
6430 tex->dim = dim;
6431 tex->can_reorder = true;
6432 ctx->block->instructions.emplace_back(std::move(tex));
6433
6434 if (div_by_6) {
6435 /* divide 3rd value by 6 by multiplying with magic number */
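/* (for cube array txs the hardware reports layers = 6 * cubes, while GLSL expects the number of cubes) */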
6436 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6437 Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
6438 Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c);
6439 assert(instr->dest.ssa.num_components == 3);
6440 Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
6441 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6442 emit_extract_vector(ctx, tmp_dst, 0, v1),
6443 emit_extract_vector(ctx, tmp_dst, 1, v1),
6444 by_6);
6445
6446 }
6447
6448 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6449 return;
6450 }
6451
6452 Temp tg4_compare_cube_wa64 = Temp();
6453
6454 if (tg4_integer_workarounds) {
6455 tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6456 tex->operands[0] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6457 tex->operands[1] = Operand(resource);
6458 tex->dim = dim;
6459 tex->dmask = 0x3;
6460 tex->da = da;
6461 Temp size = bld.tmp(v2);
6462 tex->definitions[0] = Definition(size);
6463 tex->can_reorder = true;
6464 ctx->block->instructions.emplace_back(std::move(tex));
6465 emit_split_vector(ctx, size, size.size());
6466
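/* gather4 on integer formats fetches wrongly rounded texels on GFX8 and older; compute -0.5 / size so the coordinates can be shifted back by half a texel (see lower_gather4_integer() in ac_nir_to_llvm.c) */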
6467 Temp half_texel[2];
6468 for (unsigned i = 0; i < 2; i++) {
6469 half_texel[i] = emit_extract_vector(ctx, size, i, v1);
6470 half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
6471 half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
6472 half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
6473 }
6474
6475 Temp orig_coords[2] = {
6476 emit_extract_vector(ctx, coords, 0, v1),
6477 emit_extract_vector(ctx, coords, 1, v1)};
6478 Temp new_coords[2] = {
6479 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[0], half_texel[0]),
6480 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[1], half_texel[1])
6481 };
6482
6483 if (tg4_integer_cube_workaround) {
6484 // see comment in ac_nir_to_llvm.c's lower_gather4_integer()
6485 Temp desc[resource.size()];
6486 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector,
6487 Format::PSEUDO, 1, resource.size())};
6488 split->operands[0] = Operand(resource);
6489 for (unsigned i = 0; i < resource.size(); i++) {
6490 desc[i] = bld.tmp(s1);
6491 split->definitions[i] = Definition(desc[i]);
6492 }
6493 ctx->block->instructions.emplace_back(std::move(split));
6494
6495 Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16)));
6496 Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
6497 Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8));
6498
6499 Temp nfmt;
6500 if (stype == GLSL_TYPE_UINT) {
6501 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6502 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED),
6503 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT),
6504 bld.scc(compare_cube_wa));
6505 } else {
6506 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6507 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED),
6508 Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
6509 bld.scc(compare_cube_wa));
6510 }
6511 tg4_compare_cube_wa64 = as_divergent_bool(ctx, compare_cube_wa, true);
6512 nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
6513
6514 desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
6515 Operand((uint32_t)C_008F14_NUM_FORMAT));
6516 desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
6517
6518 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
6519 Format::PSEUDO, resource.size(), 1)};
6520 for (unsigned i = 0; i < resource.size(); i++)
6521 vec->operands[i] = Operand(desc[i]);
6522 resource = bld.tmp(resource.regClass());
6523 vec->definitions[0] = Definition(resource);
6524 ctx->block->instructions.emplace_back(std::move(vec));
6525
6526 new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6527 new_coords[0], orig_coords[0], tg4_compare_cube_wa64);
6528 new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6529 new_coords[1], orig_coords[1], tg4_compare_cube_wa64);
6530 }
6531
6532 if (coords.size() == 3) {
6533 coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3),
6534 new_coords[0], new_coords[1],
6535 emit_extract_vector(ctx, coords, 2, v1));
6536 } else {
6537 assert(coords.size() == 2);
6538 coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
6539 new_coords[0], new_coords[1]);
6540 }
6541 }
6542
6543 if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
6544 instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
6545 instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
6546 coords = emit_wqm(ctx, coords, bld.tmp(coords.regClass()), true);
6547
6548 std::vector<Operand> args;
6549 if (has_offset)
6550 args.emplace_back(Operand(offset));
6551 if (has_bias)
6552 args.emplace_back(Operand(bias));
6553 if (has_compare)
6554 args.emplace_back(Operand(compare));
6555 if (has_derivs)
6556 args.emplace_back(Operand(derivs));
6557 args.emplace_back(Operand(coords));
6558 if (has_sample_index)
6559 args.emplace_back(Operand(sample_index));
6560 if (has_lod)
6561 args.emplace_back(lod);
6562
6563 Operand arg;
6564 if (args.size() > 1) {
6565 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
6566 unsigned size = 0;
6567 for (unsigned i = 0; i < args.size(); i++) {
6568 size += args[i].size();
6569 vec->operands[i] = args[i];
6570 }
6571 RegClass rc = RegClass(RegType::vgpr, size);
6572 Temp tmp = bld.tmp(rc);
6573 vec->definitions[0] = Definition(tmp);
6574 ctx->block->instructions.emplace_back(std::move(vec));
6575 arg = Operand(tmp);
6576 } else {
6577 assert(args[0].isTemp());
6578 arg = Operand(as_vgpr(ctx, args[0].getTemp()));
6579 }
6580
6581 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
6582 //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
6583
6584 assert(coords.size() == 1);
6585 unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
6586 aco_opcode op;
6587 switch (last_bit) {
6588 case 1:
6589 op = aco_opcode::buffer_load_format_x; break;
6590 case 2:
6591 op = aco_opcode::buffer_load_format_xy; break;
6592 case 3:
6593 op = aco_opcode::buffer_load_format_xyz; break;
6594 case 4:
6595 op = aco_opcode::buffer_load_format_xyzw; break;
6596 default:
6597 unreachable("Tex instruction loads more than 4 components.");
6598 }
6599
6600 /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */
6601 if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
6602 tmp_dst = dst;
6603 else
6604 tmp_dst = bld.tmp(RegType::vgpr, last_bit);
6605
6606 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
6607 mubuf->operands[0] = Operand(coords);
6608 mubuf->operands[1] = Operand(resource);
6609 mubuf->operands[2] = Operand((uint32_t) 0);
6610 mubuf->definitions[0] = Definition(tmp_dst);
6611 mubuf->idxen = true;
6612 mubuf->can_reorder = true;
6613 ctx->block->instructions.emplace_back(std::move(mubuf));
6614
6615 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
6616 return;
6617 }
6618
6619
6620 if (instr->op == nir_texop_txf ||
6621 instr->op == nir_texop_txf_ms ||
6622 instr->op == nir_texop_samples_identical) {
6623 aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
6624 tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 2, 1));
6625 tex->operands[0] = Operand(arg);
6626 tex->operands[1] = Operand(resource);
6627 tex->dim = dim;
6628 tex->dmask = dmask;
6629 tex->unrm = true;
6630 tex->da = da;
6631 tex->definitions[0] = Definition(tmp_dst);
6632 tex->can_reorder = true;
6633 ctx->block->instructions.emplace_back(std::move(tex));
6634
6635 if (instr->op == nir_texop_samples_identical) {
6636 assert(dmask == 1 && dst.regClass() == v1);
6637 assert(dst.id() != tmp_dst.id());
6638
6639 Temp tmp = bld.tmp(s2);
6640 bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
6641 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
6642
6643 } else {
6644 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6645 }
6646 return;
6647 }
6648
6649 // TODO: it would be better to do this by adding offsets, but that requires the opcodes to be ordered accordingly.
6650 aco_opcode opcode = aco_opcode::image_sample;
6651 if (has_offset) { /* image_sample_*_o */
6652 if (has_compare) {
6653 opcode = aco_opcode::image_sample_c_o;
6654 if (has_derivs)
6655 opcode = aco_opcode::image_sample_c_d_o;
6656 if (has_bias)
6657 opcode = aco_opcode::image_sample_c_b_o;
6658 if (level_zero)
6659 opcode = aco_opcode::image_sample_c_lz_o;
6660 if (has_lod)
6661 opcode = aco_opcode::image_sample_c_l_o;
6662 } else {
6663 opcode = aco_opcode::image_sample_o;
6664 if (has_derivs)
6665 opcode = aco_opcode::image_sample_d_o;
6666 if (has_bias)
6667 opcode = aco_opcode::image_sample_b_o;
6668 if (level_zero)
6669 opcode = aco_opcode::image_sample_lz_o;
6670 if (has_lod)
6671 opcode = aco_opcode::image_sample_l_o;
6672 }
6673 } else { /* no offset */
6674 if (has_compare) {
6675 opcode = aco_opcode::image_sample_c;
6676 if (has_derivs)
6677 opcode = aco_opcode::image_sample_c_d;
6678 if (has_bias)
6679 opcode = aco_opcode::image_sample_c_b;
6680 if (level_zero)
6681 opcode = aco_opcode::image_sample_c_lz;
6682 if (has_lod)
6683 opcode = aco_opcode::image_sample_c_l;
6684 } else {
6685 opcode = aco_opcode::image_sample;
6686 if (has_derivs)
6687 opcode = aco_opcode::image_sample_d;
6688 if (has_bias)
6689 opcode = aco_opcode::image_sample_b;
6690 if (level_zero)
6691 opcode = aco_opcode::image_sample_lz;
6692 if (has_lod)
6693 opcode = aco_opcode::image_sample_l;
6694 }
6695 }
6696
6697 if (instr->op == nir_texop_tg4) {
6698 if (has_offset) {
6699 opcode = aco_opcode::image_gather4_lz_o;
6700 if (has_compare)
6701 opcode = aco_opcode::image_gather4_c_lz_o;
6702 } else {
6703 opcode = aco_opcode::image_gather4_lz;
6704 if (has_compare)
6705 opcode = aco_opcode::image_gather4_c_lz;
6706 }
6707 } else if (instr->op == nir_texop_lod) {
6708 opcode = aco_opcode::image_get_lod;
6709 }
6710
6711 tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
6712 tex->operands[0] = arg;
6713 tex->operands[1] = Operand(resource);
6714 tex->operands[2] = Operand(sampler);
6715 tex->dim = dim;
6716 tex->dmask = dmask;
6717 tex->da = da;
6718 tex->definitions[0] = Definition(tmp_dst);
6719 tex->can_reorder = true;
6720 ctx->block->instructions.emplace_back(std::move(tex));
6721
6722 if (tg4_integer_cube_workaround) {
6723 assert(tmp_dst.id() != dst.id());
6724 assert(tmp_dst.size() == dst.size() && dst.size() == 4);
6725
6726 emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6727 Temp val[4];
6728 for (unsigned i = 0; i < dst.size(); i++) {
6729 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
6730 Temp cvt_val;
6731 if (stype == GLSL_TYPE_UINT)
6732 cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
6733 else
6734 cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
6735 val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64);
6736 }
6737 Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4);
6738 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6739 val[0], val[1], val[2], val[3]);
6740 }
6741 unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask;
6742 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
6743
6744 }
6745
6746
6747 Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa)
6748 {
6749 Temp tmp = get_ssa_temp(ctx, ssa);
6750 if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
6751 return Operand(tmp.regClass());
6752 else
6753 return Operand(tmp);
6754 }
6755
6756 void visit_phi(isel_context *ctx, nir_phi_instr *instr)
6757 {
6758 aco_ptr<Pseudo_instruction> phi;
6759 unsigned num_src = exec_list_length(&instr->srcs);
6760 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6761
6762 aco_opcode opcode = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index] ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
6763
6764 std::map<unsigned, nir_ssa_def*> phi_src;
6765 bool all_undef = true;
6766 nir_foreach_phi_src(src, instr) {
6767 phi_src[src->pred->index] = src->src.ssa;
6768 if (src->src.ssa->parent_instr->type != nir_instr_type_ssa_undef)
6769 all_undef = false;
6770 }
6771 if (all_undef) {
6772 Builder bld(ctx->program, ctx->block);
6773 if (dst.regClass() == s1) {
6774 bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
6775 } else if (dst.regClass() == v1) {
6776 bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
6777 } else {
6778 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6779 for (unsigned i = 0; i < dst.size(); i++)
6780 vec->operands[i] = Operand(0u);
6781 vec->definitions[0] = Definition(dst);
6782 ctx->block->instructions.emplace_back(std::move(vec));
6783 }
6784 return;
6785 }
6786
6787 /* try to scalarize vector phis */
6788 if (dst.size() > 1) {
6789 // TODO: scalarize linear phis on divergent ifs
6790 bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
6791 std::array<Temp, 4> new_vec;
6792 for (std::pair<const unsigned, nir_ssa_def*>& pair : phi_src) {
6793 Operand src = get_phi_operand(ctx, pair.second);
6794 if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end()) {
6795 can_scalarize = false;
6796 break;
6797 }
6798 }
6799 if (can_scalarize) {
6800 unsigned num_components = instr->dest.ssa.num_components;
6801 assert(dst.size() % num_components == 0);
6802 RegClass rc = RegClass(dst.type(), dst.size() / num_components);
6803
6804 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
6805 for (unsigned k = 0; k < num_components; k++) {
6806 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src, 1));
6807 std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin();
6808 for (unsigned i = 0; i < num_src; i++) {
6809 Operand src = get_phi_operand(ctx, it->second);
6810 phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc);
6811 ++it;
6812 }
6813 Temp phi_dst = {ctx->program->allocateId(), rc};
6814 phi->definitions[0] = Definition(phi_dst);
6815 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
6816 new_vec[k] = phi_dst;
6817 vec->operands[k] = Operand(phi_dst);
6818 }
6819 vec->definitions[0] = Definition(dst);
6820 ctx->block->instructions.emplace_back(std::move(vec));
6821 ctx->allocated_vec.emplace(dst.id(), new_vec);
6822 return;
6823 }
6824 }
6825
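/* loops lowered with a continue_or_break block (see visit_loop) give the loop exit an extra linear predecessor that carries no value, so linear phis need one additional undef operand */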
6826 unsigned extra_src = 0;
6827 if (opcode == aco_opcode::p_linear_phi && (ctx->block->kind & block_kind_loop_exit) &&
6828 ctx->program->blocks[ctx->block->index-2].kind & block_kind_continue_or_break) {
6829 extra_src++;
6830 }
6831
6832 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src + extra_src, 1));
6833
6834 /* if we have a linear phi on a divergent if, we know that one src is undef */
6835 if (opcode == aco_opcode::p_linear_phi && ctx->block->kind & block_kind_merge) {
6836 assert(extra_src == 0);
6837 Block* block;
6838 /* we place the phi either in the invert-block or in the current block */
6839 if (phi_src.begin()->second->parent_instr->type != nir_instr_type_ssa_undef) {
6840 assert((++phi_src.begin())->second->parent_instr->type == nir_instr_type_ssa_undef);
6841 Block& linear_else = ctx->program->blocks[ctx->block->linear_preds[1]];
6842 block = &ctx->program->blocks[linear_else.linear_preds[0]];
6843 assert(block->kind & block_kind_invert);
6844 phi->operands[0] = get_phi_operand(ctx, phi_src.begin()->second);
6845 } else {
6846 assert((++phi_src.begin())->second->parent_instr->type != nir_instr_type_ssa_undef);
6847 block = ctx->block;
6848 phi->operands[0] = get_phi_operand(ctx, (++phi_src.begin())->second);
6849 }
6850 phi->operands[1] = Operand(dst.regClass());
6851 phi->definitions[0] = Definition(dst);
6852 block->instructions.emplace(block->instructions.begin(), std::move(phi));
6853 return;
6854 }
6855
6856 std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin();
6857 for (unsigned i = 0; i < num_src; i++) {
6858 phi->operands[i] = get_phi_operand(ctx, it->second);
6859 ++it;
6860 }
6861 for (unsigned i = 0; i < extra_src; i++)
6862 phi->operands[num_src + i] = Operand(dst.regClass());
6863 phi->definitions[0] = Definition(dst);
6864 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
6865 }
6866
6867
6868 void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr)
6869 {
6870 Temp dst = get_ssa_temp(ctx, &instr->def);
6871
6872 assert(dst.type() == RegType::sgpr);
6873
6874 if (dst.size() == 1) {
6875 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u));
6876 } else {
6877 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6878 for (unsigned i = 0; i < dst.size(); i++)
6879 vec->operands[i] = Operand(0u);
6880 vec->definitions[0] = Definition(dst);
6881 ctx->block->instructions.emplace_back(std::move(vec));
6882 }
6883 }
6884
6885 void visit_jump(isel_context *ctx, nir_jump_instr *instr)
6886 {
6887 Builder bld(ctx->program, ctx->block);
6888 Block *logical_target;
6889 append_logical_end(ctx->block);
6890 unsigned idx = ctx->block->index;
6891
6892 switch (instr->type) {
6893 case nir_jump_break:
6894 logical_target = ctx->cf_info.parent_loop.exit;
6895 add_logical_edge(idx, logical_target);
6896 ctx->block->kind |= block_kind_break;
6897
6898 if (!ctx->cf_info.parent_if.is_divergent &&
6899 !ctx->cf_info.parent_loop.has_divergent_continue) {
6900 /* uniform break - directly jump out of the loop */
6901 ctx->block->kind |= block_kind_uniform;
6902 ctx->cf_info.has_branch = true;
6903 bld.branch(aco_opcode::p_branch);
6904 add_linear_edge(idx, logical_target);
6905 return;
6906 }
6907 ctx->cf_info.parent_loop.has_divergent_branch = true;
6908 break;
6909 case nir_jump_continue:
6910 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
6911 add_logical_edge(idx, logical_target);
6912 ctx->block->kind |= block_kind_continue;
6913
6914 if (ctx->cf_info.parent_if.is_divergent) {
6915 /* for potential uniform breaks after this continue,
6916 we must ensure that they are handled correctly */
6917 ctx->cf_info.parent_loop.has_divergent_continue = true;
6918 ctx->cf_info.parent_loop.has_divergent_branch = true;
6919 } else {
6920 /* uniform continue - directly jump to the loop header */
6921 ctx->block->kind |= block_kind_uniform;
6922 ctx->cf_info.has_branch = true;
6923 bld.branch(aco_opcode::p_branch);
6924 add_linear_edge(idx, logical_target);
6925 return;
6926 }
6927 break;
6928 default:
6929 fprintf(stderr, "Unknown NIR jump instr: ");
6930 nir_print_instr(&instr->instr, stderr);
6931 fprintf(stderr, "\n");
6932 abort();
6933 }
6934
6935 /* remove critical edges from linear CFG */
6936 bld.branch(aco_opcode::p_branch);
6937 Block* break_block = ctx->program->create_and_insert_block();
6938 break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
6939 break_block->kind |= block_kind_uniform;
6940 add_linear_edge(idx, break_block);
6941 /* the loop_header pointer might be invalidated by this point */
6942 if (instr->type == nir_jump_continue)
6943 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
6944 add_linear_edge(break_block->index, logical_target);
6945 bld.reset(break_block);
6946 bld.branch(aco_opcode::p_branch);
6947
6948 Block* continue_block = ctx->program->create_and_insert_block();
6949 continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
6950 add_linear_edge(idx, continue_block);
6951 append_logical_start(continue_block);
6952 ctx->block = continue_block;
6953 return;
6954 }
6955
6956 void visit_block(isel_context *ctx, nir_block *block)
6957 {
6958 nir_foreach_instr(instr, block) {
6959 switch (instr->type) {
6960 case nir_instr_type_alu:
6961 visit_alu_instr(ctx, nir_instr_as_alu(instr));
6962 break;
6963 case nir_instr_type_load_const:
6964 visit_load_const(ctx, nir_instr_as_load_const(instr));
6965 break;
6966 case nir_instr_type_intrinsic:
6967 visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
6968 break;
6969 case nir_instr_type_tex:
6970 visit_tex(ctx, nir_instr_as_tex(instr));
6971 break;
6972 case nir_instr_type_phi:
6973 visit_phi(ctx, nir_instr_as_phi(instr));
6974 break;
6975 case nir_instr_type_ssa_undef:
6976 visit_undef(ctx, nir_instr_as_ssa_undef(instr));
6977 break;
6978 case nir_instr_type_deref:
6979 break;
6980 case nir_instr_type_jump:
6981 visit_jump(ctx, nir_instr_as_jump(instr));
6982 break;
6983 default:
6984 fprintf(stderr, "Unknown NIR instr type: ");
6985 nir_print_instr(instr, stderr);
6986 fprintf(stderr, "\n");
6987 //abort();
6988 }
6989 }
6990 }
6991
6992
6993
6994 static void visit_loop(isel_context *ctx, nir_loop *loop)
6995 {
6996 append_logical_end(ctx->block);
6997 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
6998 Builder bld(ctx->program, ctx->block);
6999 bld.branch(aco_opcode::p_branch);
7000 unsigned loop_preheader_idx = ctx->block->index;
7001
7002 Block loop_exit = Block();
7003 loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7004 loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
7005
7006 Block* loop_header = ctx->program->create_and_insert_block();
7007 loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
7008 loop_header->kind |= block_kind_loop_header;
7009 add_edge(loop_preheader_idx, loop_header);
7010 ctx->block = loop_header;
7011
7012 /* emit loop body */
7013 unsigned loop_header_idx = loop_header->index;
7014 loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
7015 append_logical_start(ctx->block);
7016 visit_cf_list(ctx, &loop->body);
7017
7018 //TODO: what if a loop ends with an unconditional or uniformly branched continue and this branch is never taken?
7019 if (!ctx->cf_info.has_branch) {
7020 append_logical_end(ctx->block);
7021 if (ctx->cf_info.exec_potentially_empty) {
7022 /* Discards can result in code running with an empty exec mask.
7023 * This would result in divergent breaks not ever being taken. As a
7024 * workaround, break the loop when the loop mask is empty instead of
7025 * always continuing. */
7026 ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
7027
7028 /* create "loop_almost_exit" to avoid critical edges */
7029 unsigned block_idx = ctx->block->index;
7030 Block *loop_almost_exit = ctx->program->create_and_insert_block();
7031 loop_almost_exit->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7032 loop_almost_exit->kind = block_kind_uniform;
7033 bld.reset(loop_almost_exit);
7034 bld.branch(aco_opcode::p_branch);
7035
7036 add_linear_edge(block_idx, loop_almost_exit);
7037 add_linear_edge(loop_almost_exit->index, &loop_exit);
7038
7039 ctx->block = &ctx->program->blocks[block_idx];
7040 } else {
7041 ctx->block->kind |= (block_kind_continue | block_kind_uniform);
7042 }
7043 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7044 add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
7045 else
7046 add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
7047 bld.reset(ctx->block);
7048 bld.branch(aco_opcode::p_branch);
7049 }
7050
7051 /* fixup phis in loop header from unreachable blocks */
7052 if (ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch) {
7053 bool linear = ctx->cf_info.has_branch;
7054 bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
7055 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
7056 if ((logical && instr->opcode == aco_opcode::p_phi) ||
7057 (linear && instr->opcode == aco_opcode::p_linear_phi)) {
7058 /* the last operand should be the one that needs to be removed */
7059 instr->operands.pop_back();
7060 } else if (!is_phi(instr)) {
7061 break;
7062 }
7063 }
7064 }
7065
7066 ctx->cf_info.has_branch = false;
7067
7068 // TODO: if the loop does not have a single exit block, we need to create one
7069 /* emit loop successor block */
7070 ctx->block = ctx->program->insert_block(std::move(loop_exit));
7071 append_logical_start(ctx->block);
7072
7073 #if 0
7074 // TODO: check if it is beneficial to not branch on continues
7075 /* trim linear phis in loop header */
7076 for (auto&& instr : loop_entry->instructions) {
7077 if (instr->opcode == aco_opcode::p_linear_phi) {
7078 aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
7079 new_phi->definitions[0] = instr->definitions[0];
7080 for (unsigned i = 0; i < new_phi->operands.size(); i++)
7081 new_phi->operands[i] = instr->operands[i];
7082 /* check that the remaining operands are all the same */
7083 for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
7084 assert(instr->operands[i].tempId() == instr->operands.back().tempId());
7085 instr.swap(new_phi);
7086 } else if (instr->opcode == aco_opcode::p_phi) {
7087 continue;
7088 } else {
7089 break;
7090 }
7091 }
7092 #endif
7093 }
7094
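/* The following three helpers bracket divergent control flow. A minimal
 * usage sketch (this mirrors how visit_if() and emit_streamout() below use
 * them):
 *
 *    if_context ic;
 *    begin_divergent_if_then(ctx, &ic, cond);
 *    ... emit the then-side instructions ...
 *    begin_divergent_if_else(ctx, &ic);
 *    ... emit the else-side instructions ...
 *    end_divergent_if(ctx, &ic);
 */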
7095 static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond)
7096 {
7097 ic->cond = cond;
7098
7099 append_logical_end(ctx->block);
7100 ctx->block->kind |= block_kind_branch;
7101
7102 /* branch to linear then block */
7103 assert(cond.regClass() == s2);
7104 aco_ptr<Pseudo_branch_instruction> branch;
7105 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7106 branch->operands[0] = Operand(cond);
7107 ctx->block->instructions.push_back(std::move(branch));
7108
7109 ic->BB_if_idx = ctx->block->index;
7110 ic->BB_invert = Block();
7111 ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7112 /* Invert blocks are intentionally not marked as top level because they
7113 * are not part of the logical cfg. */
7114 ic->BB_invert.kind |= block_kind_invert;
7115 ic->BB_endif = Block();
7116 ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7117 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
7118
7119 ic->exec_potentially_empty_old = ctx->cf_info.exec_potentially_empty;
7120 ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
7121 ctx->cf_info.parent_if.is_divergent = true;
7122 ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7123
7124 /** emit logical then block */
7125 Block* BB_then_logical = ctx->program->create_and_insert_block();
7126 BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7127 add_edge(ic->BB_if_idx, BB_then_logical);
7128 ctx->block = BB_then_logical;
7129 append_logical_start(BB_then_logical);
7130 }
7131
7132 static void begin_divergent_if_else(isel_context *ctx, if_context *ic)
7133 {
7134 Block *BB_then_logical = ctx->block;
7135 append_logical_end(BB_then_logical);
7136 /* branch from logical then block to invert block */
7137 aco_ptr<Pseudo_branch_instruction> branch;
7138 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7139 BB_then_logical->instructions.emplace_back(std::move(branch));
7140 add_linear_edge(BB_then_logical->index, &ic->BB_invert);
7141 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7142 add_logical_edge(BB_then_logical->index, &ic->BB_endif);
7143 BB_then_logical->kind |= block_kind_uniform;
7144 assert(!ctx->cf_info.has_branch);
7145 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7146 ctx->cf_info.parent_loop.has_divergent_branch = false;
7147
7148 /** emit linear then block */
7149 Block* BB_then_linear = ctx->program->create_and_insert_block();
7150 BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7151 BB_then_linear->kind |= block_kind_uniform;
7152 add_linear_edge(ic->BB_if_idx, BB_then_linear);
7153 /* branch from linear then block to invert block */
7154 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7155 BB_then_linear->instructions.emplace_back(std::move(branch));
7156 add_linear_edge(BB_then_linear->index, &ic->BB_invert);
7157
7158 /** emit invert merge block */
7159 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
7160 ic->invert_idx = ctx->block->index;
7161
7162 /* branch to linear else block (skip else) */
7163 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0));
7164 branch->operands[0] = Operand(ic->cond);
7165 ctx->block->instructions.push_back(std::move(branch));
7166
7167 ic->exec_potentially_empty_old |= ctx->cf_info.exec_potentially_empty;
7168 ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7169
7170 /** emit logical else block */
7171 Block* BB_else_logical = ctx->program->create_and_insert_block();
7172 BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7173 add_logical_edge(ic->BB_if_idx, BB_else_logical);
7174 add_linear_edge(ic->invert_idx, BB_else_logical);
7175 ctx->block = BB_else_logical;
7176 append_logical_start(BB_else_logical);
7177 }
7178
7179 static void end_divergent_if(isel_context *ctx, if_context *ic)
7180 {
7181 Block *BB_else_logical = ctx->block;
7182 append_logical_end(BB_else_logical);
7183
7184 /* branch from logical else block to endif block */
7185 aco_ptr<Pseudo_branch_instruction> branch;
7186 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7187 BB_else_logical->instructions.emplace_back(std::move(branch));
7188 add_linear_edge(BB_else_logical->index, &ic->BB_endif);
7189 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7190 add_logical_edge(BB_else_logical->index, &ic->BB_endif);
7191 BB_else_logical->kind |= block_kind_uniform;
7192
7193 assert(!ctx->cf_info.has_branch);
7194 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
7195
7196
7197 /** emit linear else block */
7198 Block* BB_else_linear = ctx->program->create_and_insert_block();
7199 BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7200 BB_else_linear->kind |= block_kind_uniform;
7201 add_linear_edge(ic->invert_idx, BB_else_linear);
7202
7203 /* branch from linear else block to endif block */
7204 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7205 BB_else_linear->instructions.emplace_back(std::move(branch));
7206 add_linear_edge(BB_else_linear->index, &ic->BB_endif);
7207
7208
7209 /** emit endif merge block */
7210 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
7211 append_logical_start(ctx->block);
7212
7213
7214 ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
7215 ctx->cf_info.exec_potentially_empty |= ic->exec_potentially_empty_old;
7216 /* uniform control flow never has an empty exec-mask */
7217 if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
7218 ctx->cf_info.exec_potentially_empty = false;
7219 }
7220
7221 static void visit_if(isel_context *ctx, nir_if *if_stmt)
7222 {
7223 Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
7224 Builder bld(ctx->program, ctx->block);
7225 aco_ptr<Pseudo_branch_instruction> branch;
7226
7227 if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
7228 /**
7229 * Uniform conditionals are represented in the following way*) :
7230 *
7231 * The linear and logical CFG:
7232 * BB_IF
7233 * / \
7234 * BB_THEN (logical) BB_ELSE (logical)
7235 * \ /
7236 * BB_ENDIF
7237 *
7238 * *) Exceptions may be due to break and continue statements within loops
7239 * If a break/continue happens within uniform control flow, it branches
7240 * to the loop exit/entry block. Otherwise, it branches to the next
7241 * merge block.
7242 **/
7243 append_logical_end(ctx->block);
7244 ctx->block->kind |= block_kind_uniform;
7245
7246 /* emit branch */
7247 if (cond.regClass() == s2) {
7248 // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
7249 cond = as_uniform_bool(ctx, cond);
7250 }
7251 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7252 branch->operands[0] = Operand(cond);
7253 branch->operands[0].setFixed(scc);
7254 ctx->block->instructions.emplace_back(std::move(branch));
7255
7256 unsigned BB_if_idx = ctx->block->index;
7257 Block BB_endif = Block();
7258 BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7259 BB_endif.kind |= ctx->block->kind & block_kind_top_level;
7260
7261 /** emit then block */
7262 Block* BB_then = ctx->program->create_and_insert_block();
7263 BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7264 add_edge(BB_if_idx, BB_then);
7265 append_logical_start(BB_then);
7266 ctx->block = BB_then;
7267 visit_cf_list(ctx, &if_stmt->then_list);
7268 BB_then = ctx->block;
7269 bool then_branch = ctx->cf_info.has_branch;
7270 bool then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7271
7272 if (!then_branch) {
7273 append_logical_end(BB_then);
7274 /* branch from then block to endif block */
7275 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7276 BB_then->instructions.emplace_back(std::move(branch));
7277 add_linear_edge(BB_then->index, &BB_endif);
7278 if (!then_branch_divergent)
7279 add_logical_edge(BB_then->index, &BB_endif);
7280 BB_then->kind |= block_kind_uniform;
7281 }
7282
7283 ctx->cf_info.has_branch = false;
7284 ctx->cf_info.parent_loop.has_divergent_branch = false;
7285
7286 /** emit else block */
7287 Block* BB_else = ctx->program->create_and_insert_block();
7288 BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7289 add_edge(BB_if_idx, BB_else);
7290 append_logical_start(BB_else);
7291 ctx->block = BB_else;
7292 visit_cf_list(ctx, &if_stmt->else_list);
7293 BB_else = ctx->block;
7294
7295 if (!ctx->cf_info.has_branch) {
7296 append_logical_end(BB_else);
7297 /* branch from else block to endif block */
7298 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7299 BB_else->instructions.emplace_back(std::move(branch));
7300 add_linear_edge(BB_else->index, &BB_endif);
7301 if (!ctx->cf_info.parent_loop.has_divergent_branch)
7302 add_logical_edge(BB_else->index, &BB_endif);
7303 BB_else->kind |= block_kind_uniform;
7304 }
7305
7306 ctx->cf_info.has_branch &= then_branch;
7307 ctx->cf_info.parent_loop.has_divergent_branch &= then_branch_divergent;
7308
7309 /** emit endif merge block */
7310 if (!ctx->cf_info.has_branch) {
7311 ctx->block = ctx->program->insert_block(std::move(BB_endif));
7312 append_logical_start(ctx->block);
7313 }
7314 } else { /* non-uniform condition */
7315 /**
7316 * To maintain a logical and linear CFG without critical edges,
7317 * non-uniform conditionals are represented in the following way*) :
7318 *
7319 * The linear CFG:
7320 * BB_IF
7321 * / \
7322 * BB_THEN (logical) BB_THEN (linear)
7323 * \ /
7324 * BB_INVERT (linear)
7325 * / \
7326 * BB_ELSE (logical) BB_ELSE (linear)
7327 * \ /
7328 * BB_ENDIF
7329 *
7330 * The logical CFG:
7331 * BB_IF
7332 * / \
7333 * BB_THEN (logical) BB_ELSE (logical)
7334 * \ /
7335 * BB_ENDIF
7336 *
7337 * *) Exceptions may be due to break and continue statements within loops
7338 **/
7339
7340 if_context ic;
7341
7342 begin_divergent_if_then(ctx, &ic, cond);
7343 visit_cf_list(ctx, &if_stmt->then_list);
7344
7345 begin_divergent_if_else(ctx, &ic);
7346 visit_cf_list(ctx, &if_stmt->else_list);
7347
7348 end_divergent_if(ctx, &ic);
7349 }
7350 }
7351
7352 static void visit_cf_list(isel_context *ctx,
7353 struct exec_list *list)
7354 {
7355 foreach_list_typed(nir_cf_node, node, node, list) {
7356 switch (node->type) {
7357 case nir_cf_node_block:
7358 visit_block(ctx, nir_cf_node_as_block(node));
7359 break;
7360 case nir_cf_node_if:
7361 visit_if(ctx, nir_cf_node_as_if(node));
7362 break;
7363 case nir_cf_node_loop:
7364 visit_loop(ctx, nir_cf_node_as_loop(node));
7365 break;
7366 default:
7367 unreachable("unimplemented cf list type");
7368 }
7369 }
7370 }
7371
7372 static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
7373 {
7374 int offset = ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
7375 uint64_t mask = ctx->vs_output.mask[slot];
7376 if (!is_pos && !mask)
7377 return;
7378 if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
7379 return;
7380 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7381 exp->enabled_mask = mask;
7382 for (unsigned i = 0; i < 4; ++i) {
7383 if (mask & (1 << i))
7384 exp->operands[i] = Operand(ctx->vs_output.outputs[slot][i]);
7385 else
7386 exp->operands[i] = Operand(v1);
7387 }
7388 exp->valid_mask = false;
7389 exp->done = false;
7390 exp->compressed = false;
7391 if (is_pos)
7392 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7393 else
7394 exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
7395 ctx->block->instructions.emplace_back(std::move(exp));
7396 }
7397
7398 static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
7399 {
7400 aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7401 exp->enabled_mask = 0;
7402 for (unsigned i = 0; i < 4; ++i)
7403 exp->operands[i] = Operand(v1);
7404 if (ctx->vs_output.mask[VARYING_SLOT_PSIZ]) {
7405 exp->operands[0] = Operand(ctx->vs_output.outputs[VARYING_SLOT_PSIZ][0]);
7406 exp->enabled_mask |= 0x1;
7407 }
7408 if (ctx->vs_output.mask[VARYING_SLOT_LAYER]) {
7409 exp->operands[2] = Operand(ctx->vs_output.outputs[VARYING_SLOT_LAYER][0]);
7410 exp->enabled_mask |= 0x4;
7411 }
7412 if (ctx->vs_output.mask[VARYING_SLOT_VIEWPORT]) {
7413 if (ctx->options->chip_class < GFX9) {
7414 exp->operands[3] = Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]);
7415 exp->enabled_mask |= 0x8;
7416 } else {
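/* GFX9+ expects the viewport index packed into bits [31:16] of the layer
 * export channel instead of a separate channel. */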
7417 Builder bld(ctx->program, ctx->block);
7418
7419 Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
7420 Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]));
7421 if (exp->operands[2].isTemp())
7422 out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
7423
7424 exp->operands[2] = Operand(out);
7425 exp->enabled_mask |= 0x4;
7426 }
7427 }
7428 exp->valid_mask = false;
7429 exp->done = false;
7430 exp->compressed = false;
7431 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7432 ctx->block->instructions.emplace_back(std::move(exp));
7433 }
7434
7435 static void create_vs_exports(isel_context *ctx)
7436 {
7437 radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
7438
7439 if (outinfo->export_prim_id) {
7440 ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
7441 ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = ctx->vs_prim_id;
7442 }
7443
7444 if (ctx->options->key.has_multiview_view_index) {
7445 ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1;
7446 ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, ctx->view_index);
7447 }
7448
7449 /* the order in which these position exports are created is important */
7450 int next_pos = 0;
7451 export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
7452 if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
7453 export_vs_psiz_layer_viewport(ctx, &next_pos);
7454 }
7455 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7456 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
7457 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7458 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
7459
7460 if (ctx->options->key.vs_common_out.export_clip_dists) {
7461 if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7462 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
7463 if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7464 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
7465 }
7466
7467 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
7468 if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER &&
7469 i != VARYING_SLOT_PRIMITIVE_ID)
7470 continue;
7471
7472 export_vs_varying(ctx, i, false, NULL);
7473 }
7474 }
7475
7476 static void emit_stream_output(isel_context *ctx,
7477 Temp const *so_buffers,
7478 Temp const *so_write_offset,
7479 const struct radv_stream_output *output)
7480 {
7481 unsigned num_comps = util_bitcount(output->component_mask);
7482 unsigned loc = output->location;
7483 unsigned buf = output->buffer;
7484 unsigned offset = output->offset;
7485
7486 assert(num_comps && num_comps <= 4);
7487 if (!num_comps || num_comps > 4)
7488 return;
7489
7490 unsigned start = ffs(output->component_mask) - 1;
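/* e.g. component_mask = 0b1110 -> start = 1, num_comps = 3 (y, z and w are written) */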
7491
7492 Temp out[4];
7493 bool all_undef = true;
7494 assert(ctx->stage == vertex_vs);
7495 for (unsigned i = 0; i < num_comps; i++) {
7496 out[i] = ctx->vs_output.outputs[loc][start + i];
7497 all_undef = all_undef && !out[i].id();
7498 }
7499 if (all_undef)
7500 return;
7501
7502 Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_comps)};
7503 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_comps, 1)};
7504 for (unsigned i = 0; i < num_comps; ++i)
7505 vec->operands[i] = (ctx->vs_output.mask[loc] & 1 << i) ? Operand(out[i]) : Operand(0u);
7506 vec->definitions[0] = Definition(write_data);
7507 ctx->block->instructions.emplace_back(std::move(vec));
7508
7509 aco_opcode opcode;
7510 switch (num_comps) {
7511 case 1:
7512 opcode = aco_opcode::buffer_store_dword;
7513 break;
7514 case 2:
7515 opcode = aco_opcode::buffer_store_dwordx2;
7516 break;
7517 case 3:
7518 opcode = aco_opcode::buffer_store_dwordx3;
7519 break;
7520 case 4:
7521 opcode = aco_opcode::buffer_store_dwordx4;
7522 break;
7523 }
7524
7525 aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
7526 store->operands[0] = Operand(so_write_offset[buf]);
7527 store->operands[1] = Operand(so_buffers[buf]);
7528 store->operands[2] = Operand((uint32_t) 0);
7529 store->operands[3] = Operand(write_data);
7530 if (offset > 4095) {
7531 /* The 12-bit MUBUF offset field can't hold this. It shouldn't happen with RADV, but it might with GL; handling it is easy anyway. */
7532 Builder bld(ctx->program, ctx->block);
7533 store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
7534 } else {
7535 store->offset = offset;
7536 }
7537 store->offen = true;
7538 store->glc = true;
7539 store->dlc = false;
7540 store->slc = true;
7541 store->can_reorder = true;
7542 ctx->block->instructions.emplace_back(std::move(store));
7543 }
7544
7545 static void emit_streamout(isel_context *ctx, unsigned stream)
7546 {
7547 Builder bld(ctx->program, ctx->block);
7548
7549 Temp so_buffers[4];
7550 Temp buf_ptr = convert_pointer_to_64_bit(ctx, ctx->streamout_buffers);
7551 for (unsigned i = 0; i < 4; i++) {
7552 unsigned stride = ctx->program->info->so.strides[i];
7553 if (!stride)
7554 continue;
7555
7556 so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, Operand(i * 16u));
7557 }
7558
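/* s_bfe_u32: src1 = (width << 16) | offset, so 0x70010 extracts the 7 bits
 * starting at bit 16 of the streamout config, i.e. the stream-out vertex
 * count. */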
7559 Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
7560 ctx->streamout_config, Operand(0x70010u));
7561
7562 Temp tid = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
7563 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
7564
7565 Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(s2), so_vtx_count, tid);
7566
7567 if_context ic;
7568 begin_divergent_if_then(ctx, &ic, can_emit);
7569
7570 bld.reset(ctx->block);
7571
7572 Temp so_write_index = bld.vadd32(bld.def(v1), ctx->streamout_write_idx, tid);
7573
7574 Temp so_write_offset[4];
7575
7576 for (unsigned i = 0; i < 4; i++) {
7577 unsigned stride = ctx->program->info->so.strides[i];
7578 if (!stride)
7579 continue;
7580
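/* Strides and ctx->streamout_offset are counted in dwords; the byte offset
 * computed below is (so_write_index * stride + streamout_offset[i]) * 4. */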
7581 if (stride == 1) {
7582 Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
7583 ctx->streamout_write_idx, ctx->streamout_offset[i]);
7584 Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
7585
7586 so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
7587 } else {
7588 Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
7589 Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u), ctx->streamout_offset[i]);
7590 so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
7591 }
7592 }
7593
7594 for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
7595 struct radv_stream_output *output =
7596 &ctx->program->info->so.outputs[i];
7597 if (stream != output->stream)
7598 continue;
7599
7600 emit_stream_output(ctx, so_buffers, so_write_offset, output);
7601 }
7602
7603 begin_divergent_if_else(ctx, &ic);
7604 end_divergent_if(ctx, &ic);
7605 }
7606
7607 } /* end namespace */
7608
7609 void handle_bc_optimize(isel_context *ctx)
7610 {
7611 /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
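/* When both center and centroid interpolation are requested and BC_OPTIMIZE
 * is enabled, the hardware sets the high bit of PRIM_MASK (presumably when
 * centroid and center coincide, e.g. fully covered pixels) and then doesn't
 * provide valid centroid i/j; in that case the center values are copied into
 * the centroid inputs below. */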
7612 Builder bld(ctx->program, ctx->block);
7613 uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
7614 bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
7615 bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
7616 if (uses_center && uses_centroid) {
7617 Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)), ctx->prim_mask, Operand(0u));
7618
7619 if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
7620 for (unsigned i = 0; i < 2; i++) {
7621 Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7622 ctx->fs_inputs[fs_input::persp_centroid_p1 + i],
7623 ctx->fs_inputs[fs_input::persp_center_p1 + i],
7624 sel);
7625 ctx->fs_inputs[fs_input::persp_centroid_p1 + i] = new_coord;
7626 }
7627 }
7628
7629 if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
7630 for (unsigned i = 0; i < 2; i++) {
7631 Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7632 ctx->fs_inputs[fs_input::linear_centroid_p1 + i],
7633 ctx->fs_inputs[fs_input::linear_center_p1 + i],
7634 sel);
7635 ctx->fs_inputs[fs_input::linear_centroid_p1 + i] = new_coord;
7636 }
7637 }
7638 }
7639 }
7640
7641 void select_program(Program *program,
7642 unsigned shader_count,
7643 struct nir_shader *const *shaders,
7644 ac_shader_config* config,
7645 struct radv_shader_info *info,
7646 struct radv_nir_compiler_options *options)
7647 {
7648 isel_context ctx = setup_isel_context(program, shader_count, shaders, config, info, options);
7649
7650 for (unsigned i = 0; i < shader_count; i++) {
7651 nir_shader *nir = shaders[i];
7652 init_context(&ctx, nir);
7653
7654 if (!i) {
7655 add_startpgm(&ctx); /* needs to be after init_context() for FS */
7656 append_logical_start(ctx.block);
7657 }
7658
7659 if_context ic;
7660 if (shader_count >= 2) {
7661 Builder bld(ctx.program, ctx.block);
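/* merged_wave_info packs an 8-bit thread count per merged stage at bit
 * position i * 8 (s_bfe src1 = (width << 16) | offset); only lanes with
 * thread_id < count execute this stage's code. */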
7662 Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u)));
7663 Temp thread_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
7664 bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
7665 Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(s2)), count, thread_id);
7666
7667 begin_divergent_if_then(&ctx, &ic, cond);
7668 }
7669
7670 if (i) {
7671 Builder bld(ctx.program, ctx.block);
7672 bld.barrier(aco_opcode::p_memory_barrier_shared); //TODO: different barriers are needed for different stages
7673 bld.sopp(aco_opcode::s_barrier);
7674 }
7675
7676 if (ctx.stage == fragment_fs)
7677 handle_bc_optimize(&ctx);
7678
7679 nir_function_impl *func = nir_shader_get_entrypoint(nir);
7680 visit_cf_list(&ctx, &func->body);
7681
7682 if (ctx.program->info->so.num_outputs /* && !ctx->is_gs_copy_shader */)
7683 emit_streamout(&ctx, 0);
7684
7685 if (ctx.stage == vertex_vs)
7686 create_vs_exports(&ctx);
7687
7688 if (shader_count >= 2) {
7689 begin_divergent_if_else(&ctx, &ic);
7690 end_divergent_if(&ctx, &ic);
7691 }
7692
7693 ralloc_free(ctx.divergent_vals);
7694 }
7695
7696 append_logical_end(ctx.block);
7697 ctx.block->kind |= block_kind_uniform;
7698 Builder bld(ctx.program, ctx.block);
7699 if (ctx.program->wb_smem_l1_on_end)
7700 bld.smem(aco_opcode::s_dcache_wb, false);
7701 bld.sopp(aco_opcode::s_endpgm);
7702
7703 /* finish the CFG: fill in the successor lists from the predecessor lists collected during selection */
7704 for (Block& BB : program->blocks) {
7705 for (unsigned idx : BB.linear_preds)
7706 program->blocks[idx].linear_succs.emplace_back(BB.index);
7707 for (unsigned idx : BB.logical_preds)
7708 program->blocks[idx].logical_succs.emplace_back(BB.index);
7709 }
7710 }
7711 }