src/amd/compiler/aco_instruction_selection.cpp

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  * Copyright © 2018 Google
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  *
  24  */
  25
  26 #include <algorithm>
  27 #include <array>
  28 #include <map>
  29
  30 #include "ac_shader_util.h"
  31 #include "aco_ir.h"
  32 #include "aco_builder.h"
  33 #include "aco_interface.h"
  34 #include "aco_instruction_selection_setup.cpp"
  35 #include "util/fast_idiv_by_const.h"
  36
  37 namespace aco {
  38 namespace {
  39
  40 class loop_info_RAII {
  41    isel_context* ctx;
  42    unsigned header_idx_old;
  43    Block* exit_old;
  44    bool divergent_cont_old;
  45    bool divergent_branch_old;
  46    bool divergent_if_old;
  47
  48 public:
  49    loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
  50       : ctx(ctx),
  51         header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
  52         divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
  53         divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
  54         divergent_if_old(ctx->cf_info.parent_if.is_divergent)
  55    {
  56       ctx->cf_info.parent_loop.header_idx = loop_header_idx;
  57       ctx->cf_info.parent_loop.exit = loop_exit;
  58       ctx->cf_info.parent_loop.has_divergent_continue = false;
  59       ctx->cf_info.parent_loop.has_divergent_branch = false;
  60       ctx->cf_info.parent_if.is_divergent = false;
  61       ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
  62    }
  63
  64    ~loop_info_RAII()
  65    {
  66       ctx->cf_info.parent_loop.header_idx = header_idx_old;
  67       ctx->cf_info.parent_loop.exit = exit_old;
  68       ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
  69       ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
  70       ctx->cf_info.parent_if.is_divergent = divergent_if_old;
  71       ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
  72       if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
  73          ctx->cf_info.exec_potentially_empty = false;
  74    }
  75 };
  76
/* Plain bundle of state kept while an if/else construct is being emitted.
 * NOTE(review): the per-field notes below are inferred from the field names;
 * confirm them against the code that fills and consumes this struct. */
struct if_context {
   Temp cond; /* presumably the branch condition */

   /* presumably snapshots of the cf_info state taken before entering the if */
   bool divergent_old;
   bool exec_potentially_empty_old;

   unsigned BB_if_idx;         /* presumably the index of the block holding the branch */
   unsigned invert_idx;        /* presumably the index of BB_invert */
   bool then_branch_divergent; /* presumably set when the then-side contains a divergent branch */
   Block BB_invert;            /* stored by value */
   Block BB_endif;             /* stored by value */
};
  89
/* Forward declaration, so that code placed before the definition of
 * visit_cf_list() can already call it. */
static void visit_cf_list(struct isel_context *ctx,
                          struct exec_list *list);
  92
  93 static void add_logical_edge(unsigned pred_idx, Block *succ)
  94 {
  95    succ->logical_preds.emplace_back(pred_idx);
  96 }
  97
  98
  99 static void add_linear_edge(unsigned pred_idx, Block *succ)
 100 {
 101    succ->linear_preds.emplace_back(pred_idx);
 102 }
 103
 104 static void add_edge(unsigned pred_idx, Block *succ)
 105 {
 106    add_logical_edge(pred_idx, succ);
 107    add_linear_edge(pred_idx, succ);
 108 }
 109
 110 static void append_logical_start(Block *b)
 111 {
 112    Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
 113 }
 114
 115 static void append_logical_end(Block *b)
 116 {
 117    Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
 118 }
 119
 120 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
 121 {
 122    assert(ctx->allocated[def->index].id());
 123    return ctx->allocated[def->index];
 124 }
 125
 126 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
 127 {
 128    Builder bld(ctx->program, ctx->block);
 129
 130    if (!dst.id())
 131       dst = bld.tmp(src.regClass());
 132
 133    if (ctx->stage != fragment_fs) {
 134       if (!dst.id())
 135          return src;
 136
 137       if (src.type() == RegType::vgpr || src.size() > 1)
 138          bld.copy(Definition(dst), src);
 139       else
 140          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
 141       return dst;
 142    }
 143
 144    bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
 145    ctx->program->needs_wqm |= program_needs_wqm;
 146    return dst;
 147 }
 148
 149 Temp as_vgpr(isel_context *ctx, Temp val)
 150 {
 151    if (val.type() == RegType::sgpr) {
 152       Builder bld(ctx->program, ctx->block);
 153       return bld.copy(bld.def(RegType::vgpr, val.size()), val);
 154    }
 155    assert(val.type() == RegType::vgpr);
 156    return val;
 157 }
 158
 159 //assumes a != 0xffffffff
 160 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
 161 {
 162    assert(b != 0);
 163    Builder bld(ctx->program, ctx->block);
 164
 165    if (util_is_power_of_two_or_zero(b)) {
 166       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
 167       return;
 168    }
 169
 170    util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
 171
 172    assert(info.multiplier <= 0xffffffff);
 173
 174    bool pre_shift = info.pre_shift != 0;
 175    bool increment = info.increment != 0;
 176    bool multiply = true;
 177    bool post_shift = info.post_shift != 0;
 178
 179    if (!pre_shift && !increment && !multiply && !post_shift) {
 180       bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
 181       return;
 182    }
 183
 184    Temp pre_shift_dst = a;
 185    if (pre_shift) {
 186       pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
 187       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
 188    }
 189
 190    Temp increment_dst = pre_shift_dst;
 191    if (increment) {
 192       increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
 193       bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
 194    }
 195
 196    Temp multiply_dst = increment_dst;
 197    if (multiply) {
 198       multiply_dst = post_shift ? bld.tmp(v1) : dst;
 199       bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
 200                bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
 201    }
 202
 203    if (post_shift) {
 204       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
 205    }
 206 }
 207
 208 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
 209 {
 210    Builder bld(ctx->program, ctx->block);
 211    bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
 212 }
 213
 214
 215 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
 216 {
 217    /* no need to extract the whole vector */
 218    if (src.regClass() == dst_rc) {
 219       assert(idx == 0);
 220       return src;
 221    }
 222    assert(src.size() > idx);
 223    Builder bld(ctx->program, ctx->block);
 224    auto it = ctx->allocated_vec.find(src.id());
 225    /* the size check needs to be early because elements other than 0 may be garbage */
 226    if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) {
 227       if (it->second[idx].regClass() == dst_rc) {
 228          return it->second[idx];
 229       } else {
 230          assert(dst_rc.size() == it->second[idx].regClass().size());
 231          assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
 232          return bld.copy(bld.def(dst_rc), it->second[idx]);
 233       }
 234    }
 235
 236    if (src.size() == dst_rc.size()) {
 237       assert(idx == 0);
 238       return bld.copy(bld.def(dst_rc), src);
 239    } else {
 240       Temp dst = bld.tmp(dst_rc);
 241       emit_extract_vector(ctx, src, idx, dst);
 242       return dst;
 243    }
 244 }
 245
 246 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
 247 {
 248    if (num_components == 1)
 249       return;
 250    if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
 251       return;
 252    aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
 253    split->operands[0] = Operand(vec_src);
 254    std::array<Temp,4> elems;
 255    for (unsigned i = 0; i < num_components; i++) {
 256       elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
 257       split->definitions[i] = Definition(elems[i]);
 258    }
 259    ctx->block->instructions.emplace_back(std::move(split));
 260    ctx->allocated_vec.emplace(vec_src.id(), elems);
 261 }
 262
 263 /* This vector expansion uses a mask to determine which elements in the new vector
 264  * come from the original vector. The other elements are undefined. */
 265 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
 266 {
 267    emit_split_vector(ctx, vec_src, util_bitcount(mask));
 268
 269    if (vec_src == dst)
 270       return;
 271
 272    Builder bld(ctx->program, ctx->block);
 273    if (num_components == 1) {
 274       if (dst.type() == RegType::sgpr)
 275          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
 276       else
 277          bld.copy(Definition(dst), vec_src);
 278       return;
 279    }
 280
 281    unsigned component_size = dst.size() / num_components;
 282    std::array<Temp,4> elems;
 283
 284    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
 285    vec->definitions[0] = Definition(dst);
 286    unsigned k = 0;
 287    for (unsigned i = 0; i < num_components; i++) {
 288       if (mask & (1 << i)) {
 289          Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
 290          if (dst.type() == RegType::sgpr)
 291             src = bld.as_uniform(src);
 292          vec->operands[i] = Operand(src);
 293       } else {
 294          vec->operands[i] = Operand(0u);
 295       }
 296       elems[i] = vec->operands[i].getTemp();
 297    }
 298    ctx->block->instructions.emplace_back(std::move(vec));
 299    ctx->allocated_vec.emplace(dst.id(), elems);
 300 }
 301
 302 Temp as_divergent_bool(isel_context *ctx, Temp val, bool vcc_hint)
 303 {
 304    if (val.regClass() == s2) {
 305       return val;
 306    } else {
 307       assert(val.regClass() == s1);
 308       Builder bld(ctx->program, ctx->block);
 309       Definition& def = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2),
 310                                  Operand((uint32_t) -1), Operand(0u), bld.scc(val)).def(0);
 311       if (vcc_hint)
 312          def.setHint(vcc);
 313       return def.getTemp();
 314    }
 315 }
 316
 317 Temp as_uniform_bool(isel_context *ctx, Temp val)
 318 {
 319    if (val.regClass() == s1) {
 320       return val;
 321    } else {
 322       assert(val.regClass() == s2);
 323       Builder bld(ctx->program, ctx->block);
 324       return bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), Operand(0u), Operand(val));
 325    }
 326 }
 327
 328 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
 329 {
 330    if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
 331       return get_ssa_temp(ctx, src.src.ssa);
 332
 333    if (src.src.ssa->num_components == size) {
 334       bool identity_swizzle = true;
 335       for (unsigned i = 0; identity_swizzle && i < size; i++) {
 336          if (src.swizzle[i] != i)
 337             identity_swizzle = false;
 338       }
 339       if (identity_swizzle)
 340          return get_ssa_temp(ctx, src.src.ssa);
 341    }
 342
 343    Temp vec = get_ssa_temp(ctx, src.src.ssa);
 344    unsigned elem_size = vec.size() / src.src.ssa->num_components;
 345    assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */
 346    assert(vec.size() % elem_size == 0);
 347
 348    RegClass elem_rc = RegClass(vec.type(), elem_size);
 349    if (size == 1) {
 350       return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
 351    } else {
 352       assert(size <= 4);
 353       std::array<Temp,4> elems;
 354       aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
 355       for (unsigned i = 0; i < size; ++i) {
 356          elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
 357          vec_instr->operands[i] = Operand{elems[i]};
 358       }
 359       Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)};
 360       vec_instr->definitions[0] = Definition(dst);
 361       ctx->block->instructions.emplace_back(std::move(vec_instr));
 362       ctx->allocated_vec.emplace(dst.id(), elems);
 363       return dst;
 364    }
 365 }
 366
 367 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
 368 {
 369    if (ptr.size() == 2)
 370       return ptr;
 371    Builder bld(ctx->program, ctx->block);
 372    if (ptr.type() == RegType::vgpr)
 373       ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
 374    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
 375                      ptr, Operand((unsigned)ctx->options->address32_hi));
 376 }
 377
 378 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
 379 {
 380    aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
 381    sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
 382    sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
 383    sop2->definitions[0] = Definition(dst);
 384    if (writes_scc)
 385       sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
 386    ctx->block->instructions.emplace_back(std::move(sop2));
 387 }
 388
 389 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool commutative, bool swap_srcs=false)
 390 {
 391    Builder bld(ctx->program, ctx->block);
 392    Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
 393    Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
 394    if (src1.type() == RegType::sgpr) {
 395       if (commutative && src0.type() == RegType::vgpr) {
 396          Temp t = src0;
 397          src0 = src1;
 398          src1 = t;
 399       } else if (src0.type() == RegType::vgpr &&
 400                  op != aco_opcode::v_madmk_f32 &&
 401                  op != aco_opcode::v_madak_f32 &&
 402                  op != aco_opcode::v_madmk_f16 &&
 403                  op != aco_opcode::v_madak_f16) {
 404          /* If the instruction is not commutative, we emit a VOP3A instruction */
 405          bld.vop2_e64(op, Definition(dst), src0, src1);
 406          return;
 407       } else {
 408          src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
 409       }
 410    }
 411    bld.vop2(op, Definition(dst), src0, src1);
 412 }
 413
 414 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 415 {
 416    Temp src0 = get_alu_src(ctx, instr->src[0]);
 417    Temp src1 = get_alu_src(ctx, instr->src[1]);
 418    Temp src2 = get_alu_src(ctx, instr->src[2]);
 419
 420    /* ensure that the instruction has at most 1 sgpr operand
 421     * The optimizer will inline constants for us */
 422    if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
 423       src0 = as_vgpr(ctx, src0);
 424    if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
 425       src1 = as_vgpr(ctx, src1);
 426    if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
 427       src2 = as_vgpr(ctx, src2);
 428
 429    Builder bld(ctx->program, ctx->block);
 430    bld.vop3(op, Definition(dst), src0, src1, src2);
 431 }
 432
 433 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 434 {
 435    Builder bld(ctx->program, ctx->block);
 436    bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
 437 }
 438
 439 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 440 {
 441    Temp src0 = get_alu_src(ctx, instr->src[0]);
 442    Temp src1 = get_alu_src(ctx, instr->src[1]);
 443    aco_ptr<Instruction> vopc;
 444    if (src1.type() == RegType::sgpr) {
 445       if (src0.type() == RegType::vgpr) {
 446          /* to swap the operands, we might also have to change the opcode */
 447          switch (op) {
 448             case aco_opcode::v_cmp_lt_f32:
 449                op = aco_opcode::v_cmp_gt_f32;
 450                break;
 451             case aco_opcode::v_cmp_ge_f32:
 452                op = aco_opcode::v_cmp_le_f32;
 453                break;
 454             case aco_opcode::v_cmp_lt_i32:
 455                op = aco_opcode::v_cmp_gt_i32;
 456                break;
 457             case aco_opcode::v_cmp_ge_i32:
 458                op = aco_opcode::v_cmp_le_i32;
 459                break;
 460             case aco_opcode::v_cmp_lt_u32:
 461                op = aco_opcode::v_cmp_gt_u32;
 462                break;
 463             case aco_opcode::v_cmp_ge_u32:
 464                op = aco_opcode::v_cmp_le_u32;
 465                break;
 466             case aco_opcode::v_cmp_lt_f64:
 467                op = aco_opcode::v_cmp_gt_f64;
 468                break;
 469             case aco_opcode::v_cmp_ge_f64:
 470                op = aco_opcode::v_cmp_le_f64;
 471                break;
 472             case aco_opcode::v_cmp_lt_i64:
 473                op = aco_opcode::v_cmp_gt_i64;
 474                break;
 475             case aco_opcode::v_cmp_ge_i64:
 476                op = aco_opcode::v_cmp_le_i64;
 477                break;
 478             case aco_opcode::v_cmp_lt_u64:
 479                op = aco_opcode::v_cmp_gt_u64;
 480                break;
 481             case aco_opcode::v_cmp_ge_u64:
 482                op = aco_opcode::v_cmp_le_u64;
 483                break;
 484             default: /* eq and ne are commutative */
 485                break;
 486          }
 487          Temp t = src0;
 488          src0 = src1;
 489          src1 = t;
 490       } else {
 491          src1 = as_vgpr(ctx, src1);
 492       }
 493    }
 494    Builder bld(ctx->program, ctx->block);
 495    bld.vopc(op, Definition(dst), src0, src1).def(0).setHint(vcc);
 496 }
 497
 498 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 499 {
 500    if (dst.regClass() == s2) {
 501       emit_vopc_instruction(ctx, instr, op, dst);
 502       if (!ctx->divergent_vals[instr->dest.dest.ssa.index])
 503          emit_split_vector(ctx, dst, 2);
 504    } else if (dst.regClass() == s1) {
 505       Temp src0 = get_alu_src(ctx, instr->src[0]);
 506       Temp src1 = get_alu_src(ctx, instr->src[1]);
 507       assert(src0.type() == RegType::sgpr && src1.type() == RegType::sgpr);
 508
 509       Builder bld(ctx->program, ctx->block);
 510       bld.sopc(op, bld.scc(Definition(dst)), src0, src1);
 511
 512    } else {
 513       assert(false);
 514    }
 515 }
 516
 517 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, aco_opcode op32, aco_opcode op64, Temp dst)
 518 {
 519    Builder bld(ctx->program, ctx->block);
 520    Temp src0 = get_alu_src(ctx, instr->src[0]);
 521    Temp src1 = get_alu_src(ctx, instr->src[1]);
 522    if (dst.regClass() == s2) {
 523       bld.sop2(op64, Definition(dst), bld.def(s1, scc),
 524                as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
 525    } else {
 526       assert(dst.regClass() == s1);
 527       bld.sop2(op32, bld.def(s1), bld.scc(Definition(dst)),
 528                as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
 529    }
 530 }
 531
 532
 533 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
 534 {
 535    Builder bld(ctx->program, ctx->block);
 536    Temp cond = get_alu_src(ctx, instr->src[0]);
 537    Temp then = get_alu_src(ctx, instr->src[1]);
 538    Temp els = get_alu_src(ctx, instr->src[2]);
 539
 540    if (dst.type() == RegType::vgpr) {
 541       cond = as_divergent_bool(ctx, cond, true);
 542
 543       aco_ptr<Instruction> bcsel;
 544       if (dst.size() == 1) {
 545          then = as_vgpr(ctx, then);
 546          els = as_vgpr(ctx, els);
 547
 548          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
 549       } else if (dst.size() == 2) {
 550          Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
 551          bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
 552          Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
 553          bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
 554
 555          Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
 556          Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
 557
 558          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
 559       } else {
 560          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 561          nir_print_instr(&instr->instr, stderr);
 562          fprintf(stderr, "\n");
 563       }
 564       return;
 565    }
 566
 567    if (instr->dest.dest.ssa.bit_size != 1) { /* uniform condition and values in sgpr */
 568       if (dst.regClass() == s1 || dst.regClass() == s2) {
 569          assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
 570          aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
 571          bld.sop2(op, Definition(dst), then, els, bld.scc(as_uniform_bool(ctx, cond)));
 572       } else {
 573          fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
 574          nir_print_instr(&instr->instr, stderr);
 575          fprintf(stderr, "\n");
 576       }
 577       return;
 578    }
 579
 580    /* boolean bcsel */
 581    assert(instr->dest.dest.ssa.bit_size == 1);
 582
 583    if (dst.regClass() == s1)
 584       cond = as_uniform_bool(ctx, cond);
 585
 586    if (cond.regClass() == s1) { /* uniform selection */
 587       aco_opcode op;
 588       if (dst.regClass() == s2) {
 589          op = aco_opcode::s_cselect_b64;
 590          then = as_divergent_bool(ctx, then, false);
 591          els = as_divergent_bool(ctx, els, false);
 592       } else {
 593          assert(dst.regClass() == s1);
 594          op = aco_opcode::s_cselect_b32;
 595          then = as_uniform_bool(ctx, then);
 596          els = as_uniform_bool(ctx, els);
 597       }
 598       bld.sop2(op, Definition(dst), then, els, bld.scc(cond));
 599       return;
 600    }
 601
 602    /* divergent boolean bcsel
 603     * this implements bcsel on bools: dst = s0 ? s1 : s2
 604     * are going to be: dst = (s0 & s1) | (~s0 & s2) */
 605    assert (dst.regClass() == s2);
 606    then = as_divergent_bool(ctx, then, false);
 607    els = as_divergent_bool(ctx, els, false);
 608
 609    if (cond.id() != then.id())
 610       then = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), cond, then);
 611
 612    if (cond.id() == els.id())
 613       bld.sop1(aco_opcode::s_mov_b64, Definition(dst), then);
 614    else
 615       bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), then,
 616                bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond));
 617 }
 618
 619 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
 620 {
 621    if (!instr->dest.dest.is_ssa) {
 622       fprintf(stderr, "nir alu dst not in ssa: ");
 623       nir_print_instr(&instr->instr, stderr);
 624       fprintf(stderr, "\n");
 625       abort();
 626    }
 627    Builder bld(ctx->program, ctx->block);
 628    Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
 629    switch(instr->op) {
 630    case nir_op_vec2:
 631    case nir_op_vec3:
 632    case nir_op_vec4: {
 633       std::array<Temp,4> elems;
 634       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
 635       for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
 636          elems[i] = get_alu_src(ctx, instr->src[i]);
 637          vec->operands[i] = Operand{elems[i]};
 638       }
 639       vec->definitions[0] = Definition(dst);
 640       ctx->block->instructions.emplace_back(std::move(vec));
 641       ctx->allocated_vec.emplace(dst.id(), elems);
 642       break;
 643    }
 644    case nir_op_mov: {
 645       Temp src = get_alu_src(ctx, instr->src[0]);
 646       aco_ptr<Instruction> mov;
 647       if (dst.type() == RegType::sgpr) {
 648          if (src.type() == RegType::vgpr)
 649             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
 650          else if (src.regClass() == s1)
 651             bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
 652          else if (src.regClass() == s2)
 653             bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
 654          else
 655             unreachable("wrong src register class for nir_op_imov");
 656       } else if (dst.regClass() == v1) {
 657          bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
 658       } else if (dst.regClass() == v2) {
 659          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
 660       } else {
 661          nir_print_instr(&instr->instr, stderr);
 662          unreachable("Should have been lowered to scalar.");
 663       }
 664       break;
 665    }
 666    case nir_op_inot: {
 667       Temp src = get_alu_src(ctx, instr->src[0]);
 668       /* uniform booleans */
 669       if (instr->dest.dest.ssa.bit_size == 1 && dst.regClass() == s1) {
 670          if (src.regClass() == s1) {
 671             /* in this case, src is either 1 or 0 */
 672             bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.scc(Definition(dst)), Operand(1u), src);
 673          } else {
 674             /* src is either exec_mask or 0 */
 675             assert(src.regClass() == s2);
 676             bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(Definition(dst)), Operand(0u), src);
 677          }
 678       } else if (dst.regClass() == v1) {
 679          emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
 680       } else if (dst.type() == RegType::sgpr) {
 681          aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
 682          bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
 683       } else {
 684          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 685          nir_print_instr(&instr->instr, stderr);
 686          fprintf(stderr, "\n");
 687       }
 688       break;
 689    }
 690    case nir_op_ineg: {
 691       Temp src = get_alu_src(ctx, instr->src[0]);
 692       if (dst.regClass() == v1) {
 693          bld.vsub32(Definition(dst), Operand(0u), Operand(src));
 694       } else if (dst.regClass() == s1) {
 695          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
 696       } else if (dst.size() == 2) {
 697          Temp src0 = bld.tmp(dst.type(), 1);
 698          Temp src1 = bld.tmp(dst.type(), 1);
 699          bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
 700
 701          if (dst.regClass() == s2) {
 702             Temp carry = bld.tmp(s1);
 703             Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
 704             Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
 705             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
 706          } else {
 707             Temp lower = bld.tmp(v1);
 708             Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
 709             Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
 710             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
 711          }
 712       } else {
 713          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 714          nir_print_instr(&instr->instr, stderr);
 715          fprintf(stderr, "\n");
 716       }
 717       break;
 718    }
 719    case nir_op_iabs: {
 720       if (dst.regClass() == s1) {
 721          bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
 722       } else if (dst.regClass() == v1) {
 723          Temp src = get_alu_src(ctx, instr->src[0]);
 724          bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
 725       } else {
 726          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 727          nir_print_instr(&instr->instr, stderr);
 728          fprintf(stderr, "\n");
 729       }
 730       break;
 731    }
 732    case nir_op_isign: {
           /* Lowers nir_op_isign. One short instruction sequence exists per
            * destination register class (s1, s2, v1, v2); any other class
            * only prints a diagnostic to stderr and emits nothing. */
 733       Temp src = get_alu_src(ctx, instr->src[0]);
 734       if (dst.regClass() == s1) {
              /* (src >> 31, arithmetic) added to the SCC result of (src > 0). */
 735          Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
 736          Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u));
 737          bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp);
 738       } else if (dst.regClass() == s2) {
              /* (src >> 63, arithmetic) OR-ed with the SCC result of (src != 0).
               * NOTE(review): neqz is an s1 temp used as an operand of a 64-bit
               * OR - confirm how the builder/RA treats that operand. */
 739          Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
 740          Temp neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
 741          bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, neqz);
 742       } else if (dst.regClass() == v1) {
              /* v_cndmask_b32 chooses between the constant 1 and (src >> 31)
               * using the per-lane result of the (0 >= src) compare. */
 743          Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
 744          Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
 745          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
 746       } else if (dst.regClass() == v2) {
              /* Same idea on 64 bits: the sign mask is taken from the upper
               * dword, the compare is done on the full 64-bit value, and the
               * two result dwords are selected separately (1 or mask for the
               * low dword, 0 or mask for the high dword) before recombining. */
 747          Temp upper = emit_extract_vector(ctx, src, 1, v1);
 748          Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
 749          Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
 750          Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
 751          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
 752          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
 753       } else {
 754          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 755          nir_print_instr(&instr->instr, stderr);
 756          fprintf(stderr, "\n");
 757       }
 758       break;
 759    }
 760    case nir_op_imax: {
           /* v1 destinations use v_max_i32 through the VOP2 helper, s1
            * destinations use s_max_i32 through the SOP2 helper; any other
            * register class only prints a diagnostic to stderr. */
 761       if (dst.regClass() == v1) {
 762          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
 763       } else if (dst.regClass() == s1) {
 764          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
 765       } else {
 766          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 767          nir_print_instr(&instr->instr, stderr);
 768          fprintf(stderr, "\n");
 769       }
 770       break;
 771    }
 772    case nir_op_umax: {
           /* v1 destinations use v_max_u32 through the VOP2 helper, s1
            * destinations use s_max_u32 through the SOP2 helper; any other
            * register class only prints a diagnostic to stderr. */
 773       if (dst.regClass() == v1) {
 774          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
 775       } else if (dst.regClass() == s1) {
 776          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
 777       } else {
 778          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 779          nir_print_instr(&instr->instr, stderr);
 780          fprintf(stderr, "\n");
 781       }
 782       break;
 783    }
 784    case nir_op_imin: {
           /* v1 destinations use v_min_i32 through the VOP2 helper, s1
            * destinations use s_min_i32 through the SOP2 helper; any other
            * register class only prints a diagnostic to stderr. */
 785       if (dst.regClass() == v1) {
 786          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
 787       } else if (dst.regClass() == s1) {
 788          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
 789       } else {
 790          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 791          nir_print_instr(&instr->instr, stderr);
 792          fprintf(stderr, "\n");
 793       }
 794       break;
 795    }
 796    case nir_op_umin: {
           /* v1 destinations use v_min_u32 through the VOP2 helper, s1
            * destinations use s_min_u32 through the SOP2 helper; any other
            * register class only prints a diagnostic to stderr. */
 797       if (dst.regClass() == v1) {
 798          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
 799       } else if (dst.regClass() == s1) {
 800          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
 801       } else {
 802          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 803          nir_print_instr(&instr->instr, stderr);
 804          fprintf(stderr, "\n");
 805       }
 806       break;
 807    }
 808    case nir_op_ior: {
           /* Bitwise OR. 1-bit (boolean) results are delegated to
            * emit_boolean_logic, which is given both the 32-bit and the
            * 64-bit scalar opcode. Other results dispatch on the destination
            * register class (v1, s1, s2); anything else only prints a
            * diagnostic to stderr. */
 809       if (instr->dest.dest.ssa.bit_size == 1) {
 810          emit_boolean_logic(ctx, instr, aco_opcode::s_or_b32, aco_opcode::s_or_b64, dst);
 811       } else if (dst.regClass() == v1) {
 812          emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
 813       } else if (dst.regClass() == s1) {
 814          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
 815       } else if (dst.regClass() == s2) {
 816          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
 817       } else {
 818          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 819          nir_print_instr(&instr->instr, stderr);
 820          fprintf(stderr, "\n");
 821       }
 822       break;
 823    }
 824    case nir_op_iand: {
           /* Bitwise AND. 1-bit (boolean) results are delegated to
            * emit_boolean_logic, which is given both the 32-bit and the
            * 64-bit scalar opcode. Other results dispatch on the destination
            * register class (v1, s1, s2); anything else only prints a
            * diagnostic to stderr. */
 825       if (instr->dest.dest.ssa.bit_size == 1) {
 826          emit_boolean_logic(ctx, instr, aco_opcode::s_and_b32, aco_opcode::s_and_b64, dst);
 827       } else if (dst.regClass() == v1) {
 828          emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
 829       } else if (dst.regClass() == s1) {
 830          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
 831       } else if (dst.regClass() == s2) {
 832          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
 833       } else {
 834          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 835          nir_print_instr(&instr->instr, stderr);
 836          fprintf(stderr, "\n");
 837       }
 838       break;
 839    }
 840    case nir_op_ixor: {
           /* Bitwise XOR. 1-bit (boolean) results are delegated to
            * emit_boolean_logic, which is given both the 32-bit and the
            * 64-bit scalar opcode. Other results dispatch on the destination
            * register class (v1, s1, s2); anything else only prints a
            * diagnostic to stderr. */
 841       if (instr->dest.dest.ssa.bit_size == 1) {
 842          emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::s_xor_b64, dst);
 843       } else if (dst.regClass() == v1) {
 844          emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
 845       } else if (dst.regClass() == s1) {
 846          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
 847       } else if (dst.regClass() == s2) {
 848          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
 849       } else {
 850          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 851          nir_print_instr(&instr->instr, stderr);
 852          fprintf(stderr, "\n");
 853       }
 854       break;
 855    }
 856    case nir_op_ushr: {
           /* Logical shift right, dispatched on the destination register
            * class. The vector paths use the "rev" opcodes: for v1 the VOP2
            * helper is called with (false, true) as trailing flags
            * (presumably non-commutative + swap sources - confirm against
            * emit_vop2_instruction), and for v2 the sources are passed
            * explicitly in swapped order (src[1] first, then src[0]). */
 857       if (dst.regClass() == v1) {
 858          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
 859       } else if (dst.regClass() == v2) {
 860          bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
 861                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
 862       } else if (dst.regClass() == s2) {
 863          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
 864       } else if (dst.regClass() == s1) {
 865          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
 866       } else {
 867          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 868          nir_print_instr(&instr->instr, stderr);
 869          fprintf(stderr, "\n");
 870       }
 871       break;
 872    }
 873    case nir_op_ishl: {
           /* Shift left, dispatched on the destination register class. The
            * vector paths use the "rev" opcodes: for v1 the VOP2 helper is
            * called with (false, true) as trailing flags (presumably
            * non-commutative + swap sources - confirm against
            * emit_vop2_instruction), and for v2 the sources are passed
            * explicitly in swapped order (src[1] first, then src[0]). */
 874       if (dst.regClass() == v1) {
 875          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
 876       } else if (dst.regClass() == v2) {
 877          bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
 878                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
 879       } else if (dst.regClass() == s1) {
 880          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
 881       } else if (dst.regClass() == s2) {
 882          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
 883       } else {
 884          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 885          nir_print_instr(&instr->instr, stderr);
 886          fprintf(stderr, "\n");
 887       }
 888       break;
 889    }
 890    case nir_op_ishr: {
           /* Arithmetic shift right, dispatched on the destination register
            * class. The vector paths use the "rev" opcodes: for v1 the VOP2
            * helper is called with (false, true) as trailing flags
            * (presumably non-commutative + swap sources - confirm against
            * emit_vop2_instruction), and for v2 the sources are passed
            * explicitly in swapped order (src[1] first, then src[0]). */
 891       if (dst.regClass() == v1) {
 892          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
 893       } else if (dst.regClass() == v2) {
 894          bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
 895                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
 896       } else if (dst.regClass() == s1) {
 897          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
 898       } else if (dst.regClass() == s2) {
 899          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
 900       } else {
 901          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 902          nir_print_instr(&instr->instr, stderr);
 903          fprintf(stderr, "\n");
 904       }
 905       break;
 906    }
 907    case nir_op_find_lsb: {
           /* Unlike most neighbouring cases this one dispatches on the
            * *source* register class: s1 -> s_ff1_i32_b32, v1 -> v_ffbl_b32
            * (through the VOP1 helper), s2 -> s_ff1_i32_b64. Any other source
            * class only prints a diagnostic to stderr. */
 908       Temp src = get_alu_src(ctx, instr->src[0]);
 909       if (src.regClass() == s1) {
 910          bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
 911       } else if (src.regClass() == v1) {
 912          emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
 913       } else if (src.regClass() == s2) {
 914          bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
 915       } else {
 916          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 917          nir_print_instr(&instr->instr, stderr);
 918          fprintf(stderr, "\n");
 919       }
 920       break;
 921    }
 922    case nir_op_ufind_msb:
 923    case nir_op_ifind_msb: {
           /* Shared lowering for the unsigned and signed "find most
            * significant bit" opcodes, dispatched on the *source* register
            * class. A find-first-bit-from-the-high-end opcode produces
            * msb_rev; that value is subtracted from (bit width - 1) and, if
            * the subtraction reports a carry/borrow, the result is replaced
            * by -1 (0xffffffff). Unsupported source classes only print a
            * diagnostic to stderr. */
 924       Temp src = get_alu_src(ctx, instr->src[0]);
 925       if (src.regClass() == s1 || src.regClass() == s2) {
              /* Pick the 32/64-bit and unsigned/signed scalar variant. */
 926          aco_opcode op = src.regClass() == s2 ?
 927                          (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
 928                          (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
 929          Temp msb_rev = bld.sop1(op, bld.def(s1), src);
 930
              /* src.size() * 32 - 1 is 31 for s1 sources and 63 for s2. */
 931          Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
 932                                         Operand(src.size() * 32u - 1u), msb_rev);
 933          Temp msb = sub.def(0).getTemp();
 934          Temp carry = sub.def(1).getTemp();
 935
              /* The select condition is passed as an SCC-fixed operand, the
               * same way the uadd_sat case feeds s_cselect_b32. */
 936          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry));
 937       } else if (src.regClass() == v1) {
 938          aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
 939          Temp msb_rev = bld.tmp(v1);
 940          emit_vop1_instruction(ctx, instr, op, msb_rev);
 941          Temp msb = bld.tmp(v1);
              /* 31 - msb_rev with carry-out requested; the carry drives the
               * v_cndmask_b32 between msb and the constant -1. */
 942          Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
 943          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
 944       } else {
 945          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 946          nir_print_instr(&instr->instr, stderr);
 947          fprintf(stderr, "\n");
 948       }
 949       break;
 950    }
 951    case nir_op_bitfield_reverse: {
           /* s1 destinations use s_brev_b32, v1 destinations use
            * v_bfrev_b32; any other register class only prints a diagnostic
            * to stderr. */
 952       if (dst.regClass() == s1) {
 953          bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
 954       } else if (dst.regClass() == v1) {
 955          bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
 956       } else {
 957          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 958          nir_print_instr(&instr->instr, stderr);
 959          fprintf(stderr, "\n");
 960       }
 961       break;
 962    }
 963    case nir_op_iadd: {
           /* Integer addition. The 32-bit cases (s1, v1) are handled first
            * and leave the case early; what remains is the 64-bit lowering,
            * which splits both sources into dwords, adds the low halves
            * producing a carry, adds the high halves consuming that carry and
            * recombines the two result dwords. */
 964       if (dst.regClass() == s1) {
 965          emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
 966          break;
 967       }
 968
 969       Temp src0 = get_alu_src(ctx, instr->src[0]);
 970       Temp src1 = get_alu_src(ctx, instr->src[1]);
 971       if (dst.regClass() == v1) {
 972          bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
 973          break;
 974       }
 975
 976       assert(src0.size() == 2 && src1.size() == 2);
           /* Low halves take the register type of their source, high halves
            * take the register type of the destination. */
 977       Temp src00 = bld.tmp(src0.type(), 1);
 978       Temp src01 = bld.tmp(dst.type(), 1);
 979       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
 980       Temp src10 = bld.tmp(src1.type(), 1);
 981       Temp src11 = bld.tmp(dst.type(), 1);
 982       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
 983
 984       if (dst.regClass() == s2) {
              /* Scalar: the carry travels through SCC-fixed def/operand. */
 985          Temp carry = bld.tmp(s1);
 986          Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
 987          Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
 988          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
 989       } else if (dst.regClass() == v2) {
              /* Vector: first vadd32 requests a carry-out, second one takes
               * it as carry-in. */
 990          Temp dst0 = bld.tmp(v1);
 991          Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
 992          Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
 993          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
 994       } else {
 995          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 996          nir_print_instr(&instr->instr, stderr);
 997          fprintf(stderr, "\n");
 998       }
 999       break;
1000    }
1001    case nir_op_uadd_sat: {
           /* Unsigned saturating add for 32-bit destinations (s1 or v1);
            * other register classes only print a diagnostic to stderr. */
1002       Temp src0 = get_alu_src(ctx, instr->src[0]);
1003       Temp src1 = get_alu_src(ctx, instr->src[1]);
1004       if (dst.regClass() == s1) {
              /* Add with carry in SCC, then s_cselect_b32 between the
               * constant 0xffffffff and the sum based on that carry. */
1005          Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1006          bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1007                   src0, src1);
1008          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1009       } else if (dst.regClass() == v1) {
1010          if (ctx->options->chip_class >= GFX9) {
                 /* GFX9+: a single v_add_u32 built by hand in its VOP3
                  * encoding so that the clamp bit can be set. */
1011             aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1012             add->operands[0] = Operand(src0);
1013             add->operands[1] = Operand(src1);
1014             add->definitions[0] = Definition(dst);
1015             add->clamp = 1;
1016             ctx->block->instructions.emplace_back(std::move(add));
1017          } else {
                 /* Older chips: make sure src1 is a VGPR (swapping the
                  * sources if needed), add with carry-out, then use the
                  * carry to choose between the sum and 0xffffffff. */
1018             if (src1.regClass() != v1)
1019                std::swap(src0, src1);
1020             assert(src1.regClass() == v1);
1021             Temp tmp = bld.tmp(v1);
1022             Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1023             bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1024          }
1025       } else {
1026          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1027          nir_print_instr(&instr->instr, stderr);
1028          fprintf(stderr, "\n");
1029       }
1030       break;
1031    }
1032    case nir_op_uadd_carry: {
           /* Produces the carry-out of an unsigned addition as the result
            * value; the sum itself is written to throw-away temporaries. */
1033       Temp src0 = get_alu_src(ctx, instr->src[0]);
1034       Temp src1 = get_alu_src(ctx, instr->src[1]);
1035       if (dst.regClass() == s1) {
              /* The SCC definition of s_add_u32 is bound directly to dst. */
1036          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1037          break;
1038       }
1039       if (dst.regClass() == v1) {
              /* Carry-out mask turned into a per-lane 0/1 via v_cndmask_b32. */
1040          Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1041          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1042          break;
1043       }
1044
           /* 64-bit: split both sources into dwords, chain the carry from
            * the low add into the high add, and return (carry, 0) as the two
            * result dwords. */
1045       Temp src00 = bld.tmp(src0.type(), 1);
1046       Temp src01 = bld.tmp(dst.type(), 1);
1047       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1048       Temp src10 = bld.tmp(src1.type(), 1);
1049       Temp src11 = bld.tmp(dst.type(), 1);
1050       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1051       if (dst.regClass() == s2) {
1052          Temp carry = bld.tmp(s1);
1053          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1054          carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1055          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1056       } else if (dst.regClass() == v2) {
1057          Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1058          carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1059          carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1060          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1061       } else {
1062          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1063          nir_print_instr(&instr->instr, stderr);
1064          fprintf(stderr, "\n");
1065       }
1066       break;
1067    }
1068    case nir_op_isub: {
           /* Integer subtraction. The 32-bit cases (s1, v1) are handled
            * first and leave the case early; what remains is the 64-bit
            * lowering, which splits both sources into dwords, subtracts the
            * low halves producing a borrow, subtracts the high halves
            * consuming that borrow and recombines the two result dwords. */
1069       if (dst.regClass() == s1) {
1070          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1071          break;
1072       }
1073
1074       Temp src0 = get_alu_src(ctx, instr->src[0]);
1075       Temp src1 = get_alu_src(ctx, instr->src[1]);
1076       if (dst.regClass() == v1) {
1077          bld.vsub32(Definition(dst), src0, src1);
1078          break;
1079       }
1080
           /* Low halves take the register type of their source, high halves
            * take the register type of the destination. */
1081       Temp src00 = bld.tmp(src0.type(), 1);
1082       Temp src01 = bld.tmp(dst.type(), 1);
1083       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1084       Temp src10 = bld.tmp(src1.type(), 1);
1085       Temp src11 = bld.tmp(dst.type(), 1);
1086       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1087       if (dst.regClass() == s2) {
              /* Scalar: the borrow is defined as an SCC-fixed definition and
               * must be consumed as an SCC-fixed operand as well, exactly as
               * the iadd / uadd_carry / usub_borrow cases do. */
1088          Temp carry = bld.tmp(s1);
1089          Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1090          Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
1091          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1092       } else if (dst.regClass() == v2) {
              /* Vector: first vsub32 requests a borrow-out, second one takes
               * it as borrow-in. */
1093          Temp lower = bld.tmp(v1);
1094          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1095          Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1096          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1097       } else {
1098          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1099          nir_print_instr(&instr->instr, stderr);
1100          fprintf(stderr, "\n");
1101       }
1102       break;
1103    }
1104    case nir_op_usub_borrow: {
           /* Produces the borrow-out of an unsigned subtraction as the
            * result value; the difference itself goes to throw-away
            * temporaries. */
1105       Temp src0 = get_alu_src(ctx, instr->src[0]);
1106       Temp src1 = get_alu_src(ctx, instr->src[1]);
1107       if (dst.regClass() == s1) {
              /* The SCC definition of s_sub_u32 is bound directly to dst. */
1108          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1109          break;
1110       } else if (dst.regClass() == v1) {
              /* Borrow mask turned into a per-lane 0/1 via v_cndmask_b32. */
1111          Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1112          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1113          break;
1114       }
1115
           /* 64-bit: split both sources into dwords, chain the borrow from
            * the low subtract into the high subtract, and return (borrow, 0)
            * as the two result dwords. */
1116       Temp src00 = bld.tmp(src0.type(), 1);
1117       Temp src01 = bld.tmp(dst.type(), 1);
1118       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1119       Temp src10 = bld.tmp(src1.type(), 1);
1120       Temp src11 = bld.tmp(dst.type(), 1);
1121       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1122       if (dst.regClass() == s2) {
1123          Temp borrow = bld.tmp(s1);
1124          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1125          borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1126          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1127       } else if (dst.regClass() == v2) {
1128          Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1129          borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1130          borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1131          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1132       } else {
1133          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1134          nir_print_instr(&instr->instr, stderr);
1135          fprintf(stderr, "\n");
1136       }
1137       break;
1138    }
1139    case nir_op_imul: {
           /* 32-bit integer multiply: v1 destinations emit v_mul_lo_u32 as a
            * VOP3 instruction, s1 destinations use s_mul_i32 through the SOP2
            * helper (note the trailing flag is false here, unlike the other
            * SOP2 users nearby). Other classes only print a diagnostic. */
1140       if (dst.regClass() == v1) {
1141          bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1142                   get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1143       } else if (dst.regClass() == s1) {
1144          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1145       } else {
1146          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1147          nir_print_instr(&instr->instr, stderr);
1148          fprintf(stderr, "\n");
1149       }
1150       break;
1151    }
1152    case nir_op_umul_high: {
           /* Unsigned multiply-high. v1 destinations use v_mul_hi_u32. s1
            * destinations use s_mul_hi_u32 when chip_class >= GFX9; otherwise
            * the multiply is done with the vector opcode (second source
            * forced into a VGPR) and the result is converted back with
            * p_as_uniform. Other classes only print a diagnostic. */
1153       if (dst.regClass() == v1) {
1154          bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1155       } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1156          bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1157       } else if (dst.regClass() == s1) {
1158          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1159                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1160          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1161       } else {
1162          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1163          nir_print_instr(&instr->instr, stderr);
1164          fprintf(stderr, "\n");
1165       }
1166       break;
1167    }
1168    case nir_op_imul_high: {
           /* Signed multiply-high. v1 destinations use v_mul_hi_i32. s1
            * destinations use s_mul_hi_i32 when chip_class >= GFX9; otherwise
            * the multiply is done with the vector opcode (second source
            * forced into a VGPR) and the result is converted back with
            * p_as_uniform. Other classes only print a diagnostic. */
1169       if (dst.regClass() == v1) {
1170          bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1171       } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1172          bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1173       } else if (dst.regClass() == s1) {
1174          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1175                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1176          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1177       } else {
1178          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1179          nir_print_instr(&instr->instr, stderr);
1180          fprintf(stderr, "\n");
1181       }
1182       break;
1183    }
1184    case nir_op_fmul: {
           /* Float multiply, dispatched on destination size in dwords:
            * size 1 -> v_mul_f32 through the VOP2 helper, size 2 -> v_mul_f64
            * as VOP3 with the second source forced into a VGPR. Other sizes
            * only print a diagnostic to stderr. */
1185       if (dst.size() == 1) {
1186          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1187       } else if (dst.size() == 2) {
1188          bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1189                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1190       } else {
1191          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1192          nir_print_instr(&instr->instr, stderr);
1193          fprintf(stderr, "\n");
1194       }
1195       break;
1196    }
1197    case nir_op_fadd: {
           /* Float add, dispatched on destination size in dwords:
            * size 1 -> v_add_f32 through the VOP2 helper, size 2 -> v_add_f64
            * as VOP3 with the second source forced into a VGPR. Other sizes
            * only print a diagnostic to stderr. */
1198       if (dst.size() == 1) {
1199          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1200       } else if (dst.size() == 2) {
1201          bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1202                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1203       } else {
1204          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1205          nir_print_instr(&instr->instr, stderr);
1206          fprintf(stderr, "\n");
1207       }
1208       break;
1209    }
1210    case nir_op_fsub: {
           /* Float subtract, dispatched on destination size in dwords. */
1211       Temp src0 = get_alu_src(ctx, instr->src[0]);
1212       Temp src1 = get_alu_src(ctx, instr->src[1]);
1213       if (dst.size() == 1) {
              /* src0/src1 are only inspected for their register type here.
               * v_subrev_f32 is chosen exactly when src0 is a VGPR and src1
               * is not; every other combination uses v_sub_f32. */
1214          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1215             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1216          else
1217             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1218       } else if (dst.size() == 2) {
              /* 64-bit: emitted as v_add_f64 with the VOP3 negate modifier
               * set on the second operand. */
1219          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1220                                      get_alu_src(ctx, instr->src[0]),
1221                                      as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1222          VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1223          sub->neg[1] = true;
1224       } else {
1225          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1226          nir_print_instr(&instr->instr, stderr);
1227          fprintf(stderr, "\n");
1228       }
1229       break;
1230    }
1231    case nir_op_fmax: {
           /* Float maximum, dispatched on destination size in dwords:
            * size 1 -> v_max_f32 through the VOP2 helper, size 2 -> v_max_f64
            * as VOP3 with the second source forced into a VGPR. Other sizes
            * only print a diagnostic to stderr. */
1232       if (dst.size() == 1) {
1233          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true);
1234       } else if (dst.size() == 2) {
1235          bld.vop3(aco_opcode::v_max_f64, Definition(dst),
1236                   get_alu_src(ctx, instr->src[0]),
1237                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1238       } else {
1239          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1240          nir_print_instr(&instr->instr, stderr);
1241          fprintf(stderr, "\n");
1242       }
1243       break;
1244    }
1245    case nir_op_fmin: {
           /* Float minimum, dispatched on destination size in dwords:
            * size 1 -> v_min_f32 through the VOP2 helper, size 2 -> v_min_f64
            * as VOP3 with the second source forced into a VGPR. Other sizes
            * only print a diagnostic to stderr. */
1246       if (dst.size() == 1) {
1247          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true);
1248       } else if (dst.size() == 2) {
1249          bld.vop3(aco_opcode::v_min_f64, Definition(dst),
1250                   get_alu_src(ctx, instr->src[0]),
1251                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1252       } else {
1253          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1254          nir_print_instr(&instr->instr, stderr);
1255          fprintf(stderr, "\n");
1256       }
1257       break;
1258    }
1259    case nir_op_fmax3: {
           /* Only single-dword destinations are supported: they map to
            * v_max3_f32 through the VOP3A helper. Anything else only prints a
            * diagnostic to stderr. */
1260       if (dst.size() == 1) {
1261          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst);
1262       } else {
1263          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1264          nir_print_instr(&instr->instr, stderr);
1265          fprintf(stderr, "\n");
1266       }
1267       break;
1268    }
1269    case nir_op_fmin3: {
           /* Only single-dword destinations are supported: they map to
            * v_min3_f32 through the VOP3A helper. Anything else only prints a
            * diagnostic to stderr. */
1270       if (dst.size() == 1) {
1271          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst);
1272       } else {
1273          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1274          nir_print_instr(&instr->instr, stderr);
1275          fprintf(stderr, "\n");
1276       }
1277       break;
1278    }
1279    case nir_op_fmed3: {
           /* Only single-dword destinations are supported: they map to
            * v_med3_f32 through the VOP3A helper. Anything else only prints a
            * diagnostic to stderr. */
1280       if (dst.size() == 1) {
1281          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst);
1282       } else {
1283          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1284          nir_print_instr(&instr->instr, stderr);
1285          fprintf(stderr, "\n");
1286       }
1287       break;
1288    }
1289    case nir_op_umax3: {
           /* Only single-dword destinations are supported: they map to
            * v_max3_u32 through the VOP3A helper. Anything else only prints a
            * diagnostic to stderr. */
1290       if (dst.size() == 1) {
1291          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1292       } else {
1293          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1294          nir_print_instr(&instr->instr, stderr);
1295          fprintf(stderr, "\n");
1296       }
1297       break;
1298    }
1299    case nir_op_umin3: {
           /* Only single-dword destinations are supported: they map to
            * v_min3_u32 through the VOP3A helper. Anything else only prints a
            * diagnostic to stderr. */
1300       if (dst.size() == 1) {
1301          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1302       } else {
1303          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1304          nir_print_instr(&instr->instr, stderr);
1305          fprintf(stderr, "\n");
1306       }
1307       break;
1308    }
1309    case nir_op_umed3: {
           /* Only single-dword destinations are supported: they map to
            * v_med3_u32 through the VOP3A helper. Anything else only prints a
            * diagnostic to stderr. */
1310       if (dst.size() == 1) {
1311          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1312       } else {
1313          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1314          nir_print_instr(&instr->instr, stderr);
1315          fprintf(stderr, "\n");
1316       }
1317       break;
1318    }
1319    case nir_op_imax3: {
           /* Only single-dword destinations are supported: they map to
            * v_max3_i32 through the VOP3A helper. Anything else only prints a
            * diagnostic to stderr. */
1320       if (dst.size() == 1) {
1321          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1322       } else {
1323          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1324          nir_print_instr(&instr->instr, stderr);
1325          fprintf(stderr, "\n");
1326       }
1327       break;
1328    }
1329    case nir_op_imin3: {
           /* Only single-dword destinations are supported: they map to
            * v_min3_i32 through the VOP3A helper. Anything else only prints a
            * diagnostic to stderr. */
1330       if (dst.size() == 1) {
1331          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1332       } else {
1333          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1334          nir_print_instr(&instr->instr, stderr);
1335          fprintf(stderr, "\n");
1336       }
1337       break;
1338    }
1339    case nir_op_imed3: {
           /* Only single-dword destinations are supported: they map to
            * v_med3_i32 through the VOP3A helper. Anything else only prints a
            * diagnostic to stderr. */
1340       if (dst.size() == 1) {
1341          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1342       } else {
1343          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1344          nir_print_instr(&instr->instr, stderr);
1345          fprintf(stderr, "\n");
1346       }
1347       break;
1348    }
1349    case nir_op_cube_face_coord: {
           /* Takes a 3-component source, extracts each component as a v1
            * temporary and produces a 2-component result (sc, tc). No
            * register-class check is performed in this case. */
1350       Temp in = get_alu_src(ctx, instr->src[0], 3);
1351       Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1352                       emit_extract_vector(ctx, in, 1, v1),
1353                       emit_extract_vector(ctx, in, 2, v1) };
           /* ma is replaced by its reciprocal (v_rcp_f32) before use. */
1354       Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1355       ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1356       Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1357       Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
           /* v_madak_f32 with the inline constant 0.5: presumably
            * coord * (1/ma) + 0.5 - confirm against the ISA definition. */
1358       sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1359       tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1360       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1361       break;
1362    }
1363    case nir_op_cube_face_index: {
           /* Takes a 3-component source, extracts each component as a v1
            * temporary and feeds all three to v_cubeid_f32, whose result is
            * written straight to dst. */
1364       Temp in = get_alu_src(ctx, instr->src[0], 3);
1365       Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1366                       emit_extract_vector(ctx, in, 1, v1),
1367                       emit_extract_vector(ctx, in, 2, v1) };
1368       bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1369       break;
1370    }
1371    case nir_op_bcsel: {
           /* Fully delegated to the emit_bcsel helper. */
1372       emit_bcsel(ctx, instr, dst);
1373       break;
1374    }
1375    case nir_op_frsq: {
           /* Dispatched on destination size in dwords: size 1 -> v_rsq_f32,
            * size 2 -> v_rsq_f64, both through the VOP1 helper. Other sizes
            * only print a diagnostic to stderr. */
1376       if (dst.size() == 1) {
1377          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f32, dst);
1378       } else if (dst.size() == 2) {
1379          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1380       } else {
1381          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1382          nir_print_instr(&instr->instr, stderr);
1383          fprintf(stderr, "\n");
1384       }
1385       break;
1386    }
1387    case nir_op_fneg: {
           /* Float negate implemented as a bitwise XOR with 0x80000000 (the
            * top bit of the dword holding the sign). */
1388       Temp src = get_alu_src(ctx, instr->src[0]);
1389       if (dst.size() == 1) {
1390          bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1391       } else if (dst.size() == 2) {
              /* 64-bit: split into dwords, flip the top bit of the upper
               * dword only, then recombine. */
1392          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1393          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1394          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1395          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1396       } else {
1397          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1398          nir_print_instr(&instr->instr, stderr);
1399          fprintf(stderr, "\n");
1400       }
1401       break;
1402    }
1403    case nir_op_fabs: {
           /* Float absolute value implemented as a bitwise AND with
            * 0x7FFFFFFF (clears the top bit of the dword holding the sign). */
1404       Temp src = get_alu_src(ctx, instr->src[0]);
1405       if (dst.size() == 1) {
1406          bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1407       } else if (dst.size() == 2) {
              /* 64-bit: split into dwords, clear the top bit of the upper
               * dword only, then recombine. */
1408          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1409          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1410          upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1411          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1412       } else {
1413          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1414          nir_print_instr(&instr->instr, stderr);
1415          fprintf(stderr, "\n");
1416       }
1417       break;
1418    }
1419    case nir_op_fsat: {
           /* dst.size() == 1: v_med3_f32 over 0, 0x3f800000 (the bit pattern of
            * 1.0f) and src, i.e. the median of the three values.
            * dst.size() == 2: v_add_f64 of src and 0 with the VOP3A clamp flag
            * set on the emitted instruction (presumably clamping the result to
            * [0, 1] -- confirm against the ISA documentation).
            * Other sizes only print a diagnostic.
            */
1420       Temp src = get_alu_src(ctx, instr->src[0]);
1421       if (dst.size() == 1) {
1422          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1423       } else if (dst.size() == 2) {
1424          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
1425          VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
1426          vop3->clamp = true;
1427       } else {
1428          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1429          nir_print_instr(&instr->instr, stderr);
1430          fprintf(stderr, "\n");
1431       }
1432       break;
1433    }
1434    case nir_op_flog2: {
           /* Only dst.size() == 1 is handled (v_log_f32 via
            * emit_vop1_instruction); every other size prints a diagnostic.
            */
1435       if (dst.size() == 1) {
1436          emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f32, dst);
1437       } else {
1438          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1439          nir_print_instr(&instr->instr, stderr);
1440          fprintf(stderr, "\n");
1441       }
1442       break;
1443    }
1444    case nir_op_frcp: {
           /* Hands the instruction to emit_vop1_instruction with v_rcp_f32
            * (dst.size() == 1) or v_rcp_f64 (dst.size() == 2); other sizes only
            * print a diagnostic.
            */
1445       if (dst.size() == 1) {
1446          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f32, dst);
1447       } else if (dst.size() == 2) {
1448          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
1449       } else {
1450          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1451          nir_print_instr(&instr->instr, stderr);
1452          fprintf(stderr, "\n");
1453       }
1454       break;
1455    }
1456    case nir_op_fexp2: {
           /* Only dst.size() == 1 is handled (v_exp_f32 via
            * emit_vop1_instruction); every other size prints a diagnostic.
            */
1457       if (dst.size() == 1) {
1458          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
1459       } else {
1460          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1461          nir_print_instr(&instr->instr, stderr);
1462          fprintf(stderr, "\n");
1463       }
1464       break;
1465    }
1466    case nir_op_fsqrt: {
           /* Hands the instruction to emit_vop1_instruction with v_sqrt_f32
            * (dst.size() == 1) or v_sqrt_f64 (dst.size() == 2); other sizes
            * only print a diagnostic.
            */
1467       if (dst.size() == 1) {
1468          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f32, dst);
1469       } else if (dst.size() == 2) {
1470          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
1471       } else {
1472          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1473          nir_print_instr(&instr->instr, stderr);
1474          fprintf(stderr, "\n");
1475       }
1476       break;
1477    }
1478    case nir_op_ffract: {
           /* Hands the instruction to emit_vop1_instruction with v_fract_f32
            * (dst.size() == 1) or v_fract_f64 (dst.size() == 2); other sizes
            * only print a diagnostic.
            */
1479       if (dst.size() == 1) {
1480          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
1481       } else if (dst.size() == 2) {
1482          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
1483       } else {
1484          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1485          nir_print_instr(&instr->instr, stderr);
1486          fprintf(stderr, "\n");
1487       }
1488       break;
1489    }
1490    case nir_op_ffloor: {
           /* Hands the instruction to emit_vop1_instruction with v_floor_f32
            * (dst.size() == 1) or v_floor_f64 (dst.size() == 2); other sizes
            * only print a diagnostic.
            */
1491       if (dst.size() == 1) {
1492          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
1493       } else if (dst.size() == 2) {
1494          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f64, dst);
1495       } else {
1496          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1497          nir_print_instr(&instr->instr, stderr);
1498          fprintf(stderr, "\n");
1499       }
1500       break;
1501    }
1502    case nir_op_fceil: {
           /* Hands the instruction to emit_vop1_instruction with v_ceil_f32
            * (dst.size() == 1) or v_ceil_f64 (dst.size() == 2); other sizes
            * only print a diagnostic.
            */
1503       if (dst.size() == 1) {
1504          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
1505       } else if (dst.size() == 2) {
1506          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
1507       } else {
1508          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1509          nir_print_instr(&instr->instr, stderr);
1510          fprintf(stderr, "\n");
1511       }
1512       break;
1513    }
1514    case nir_op_ftrunc: {
           /* Hands the instruction to emit_vop1_instruction with v_trunc_f32
            * (dst.size() == 1) or v_trunc_f64 (dst.size() == 2); other sizes
            * only print a diagnostic.
            */
1515       if (dst.size() == 1) {
1516          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
1517       } else if (dst.size() == 2) {
1518          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f64, dst);
1519       } else {
1520          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1521          nir_print_instr(&instr->instr, stderr);
1522          fprintf(stderr, "\n");
1523       }
1524       break;
1525    }
1526    case nir_op_fround_even: {
           /* Hands the instruction to emit_vop1_instruction with v_rndne_f32
            * (dst.size() == 1) or v_rndne_f64 (dst.size() == 2); other sizes
            * only print a diagnostic.
            */
1527       if (dst.size() == 1) {
1528          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
1529       } else if (dst.size() == 2) {
1530          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
1531       } else {
1532          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1533          nir_print_instr(&instr->instr, stderr);
1534          fprintf(stderr, "\n");
1535       }
1536       break;
1537    }
1538    case nir_op_fsin:
1539    case nir_op_fcos: {
           /* Shared lowering for sine and cosine; only dst.size() == 1 is
            * handled, other sizes print a diagnostic.
            *
            * The source is first multiplied by 0x3e22f983, which is the float
            * bit pattern of ~0.15915494 = 1/(2*pi) (presumably because
            * v_sin_f32/v_cos_f32 take their argument in revolutions rather than
            * radians -- confirm against the ISA documentation).
            */
1540       Temp src = get_alu_src(ctx, instr->src[0]);
1542       if (dst.size() == 1) {
1543          Temp tmp;
1544          Operand inv_two_pi(0x3e22f983u);
              /* the e64 (VOP3) encoding is used when the source is an SGPR */
1545          if (src.type() == RegType::sgpr)
1546             tmp = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), inv_two_pi, src);
1547          else
1548             tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), inv_two_pi, src);
1549
1550          /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
1551          if (ctx->options->chip_class < GFX9)
1552             tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
1553
1554          aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
1555          bld.vop1(opcode, Definition(dst), tmp);
1556       } else {
1557          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1558          nir_print_instr(&instr->instr, stderr);
1559          fprintf(stderr, "\n");
1560       }
1561       break;
1562    }
1563    case nir_op_ldexp: {
           /* Emits v_ldexp_f32 (dst.size() == 1) or v_ldexp_f64
            * (dst.size() == 2). The first source is forced into a VGPR with
            * as_vgpr; the second source is passed as get_alu_src returns it.
            * Other sizes only print a diagnostic.
            */
1564       if (dst.size() == 1) {
1565          bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
1566                   as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1567                   get_alu_src(ctx, instr->src[1]));
1568       } else if (dst.size() == 2) {
1569          bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
1570                   as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1571                   get_alu_src(ctx, instr->src[1]));
1572       } else {
1573          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1574          nir_print_instr(&instr->instr, stderr);
1575          fprintf(stderr, "\n");
1576       }
1577       break;
1578    }
1579    case nir_op_frexp_sig: {
           /* Emits v_frexp_mant_f32 (dst.size() == 1) or v_frexp_mant_f64
            * (dst.size() == 2) directly on the source; other sizes only print a
            * diagnostic.
            */
1580       if (dst.size() == 1) {
1581          bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst),
1582                   get_alu_src(ctx, instr->src[0]));
1583       } else if (dst.size() == 2) {
1584          bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst),
1585                   get_alu_src(ctx, instr->src[0]));
1586       } else {
1587          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1588          nir_print_instr(&instr->instr, stderr);
1589          fprintf(stderr, "\n");
1590       }
1591       break;
1592    }
1593    case nir_op_frexp_exp: {
           /* The opcode is chosen from the SOURCE bit size (32 or 64), not from
            * dst.size(): v_frexp_exp_i32_f32 or v_frexp_exp_i32_f64. Other
            * source sizes only print a diagnostic.
            */
1594       if (instr->src[0].src.ssa->bit_size == 32) {
1595          bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst),
1596                   get_alu_src(ctx, instr->src[0]));
1597       } else if (instr->src[0].src.ssa->bit_size == 64) {
1598          bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst),
1599                   get_alu_src(ctx, instr->src[0]));
1600       } else {
1601          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1602          nir_print_instr(&instr->instr, stderr);
1603          fprintf(stderr, "\n");
1604       }
1605       break;
1606    }
1607    case nir_op_fsign: {
           /* Sign of a float, built from two compare + v_cndmask_b32 pairs:
            * the first pair mixes the constant +1.0 with the source, the second
            * mixes the constant -1.0 with the result of the first pair.
            * 0x3f800000 / 0xbf800000 are the bit patterns of +1.0f / -1.0f;
            * 0x3FF00000 / 0xBFF00000 are the upper dwords of the doubles
            * +1.0 / -1.0 (their lower dword is 0).
            * Sizes other than 1 and 2 only print a diagnostic.
            */
1608       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
1609       if (dst.size() == 1) {
1610          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1611          src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
1612          cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1613          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
1614       } else if (dst.size() == 2) {
              /* Only the upper dword of the result is computed (the lower one
               * is the constant 0), so the 32-bit select must operate on the
               * upper dword of the source, not on the whole 64-bit value.
               */
              Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
              bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);

1615          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1616          Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1617          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, src_hi, cond);
1618
1619          cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1620          tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
1621          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
1622
1623          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1624       } else {
1625          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1626          nir_print_instr(&instr->instr, stderr);
1627          fprintf(stderr, "\n");
1628       }
1629       break;
1630    }
1631    case nir_op_f2f32: {
           /* Only a 64-bit source is handled (v_cvt_f32_f64 via
            * emit_vop1_instruction); other source sizes print a diagnostic.
            */
1632       if (instr->src[0].src.ssa->bit_size == 64) {
1633          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
1634       } else {
1635          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1636          nir_print_instr(&instr->instr, stderr);
1637          fprintf(stderr, "\n");
1638       }
1639       break;
1640    }
1641    case nir_op_f2f64: {
           /* Only a 32-bit source is handled (v_cvt_f64_f32 via
            * emit_vop1_instruction); other source sizes print a diagnostic.
            */
1642       if (instr->src[0].src.ssa->bit_size == 32) {
1643          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst);
1644       } else {
1645          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1646          nir_print_instr(&instr->instr, stderr);
1647          fprintf(stderr, "\n");
1648       }
1649       break;
1650    }
1651    case nir_op_i2f32: {
           /* Asserts a one-dword destination and emits v_cvt_f32_i32 via
            * emit_vop1_instruction. The source bit size is not checked here.
            */
1652       assert(dst.size() == 1);
1653       emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst);
1654       break;
1655    }
1656    case nir_op_i2f64: {
           /* 32-bit source: a single v_cvt_f64_i32.
            * 64-bit source: the value is split into two dwords; the lower one
            * is converted as unsigned (v_cvt_f64_u32), the upper one as signed
            * (v_cvt_f64_i32) and then scaled with v_ldexp_f64 by 32, and the
            * two doubles are added into dst.
            * Other source sizes only print a diagnostic.
            */
1657       if (instr->src[0].src.ssa->bit_size == 32) {
1658          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst);
1659       } else if (instr->src[0].src.ssa->bit_size == 64) {
1660          Temp src = get_alu_src(ctx, instr->src[0]);
              /* one-dword class of the same register type as the source */
1661          RegClass rc = RegClass(src.type(), 1);
1662          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1663          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1664          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1665          upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
1666          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1667          bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1668
1669       } else {
1670          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1671          nir_print_instr(&instr->instr, stderr);
1672          fprintf(stderr, "\n");
1673       }
1674       break;
1675    }
1676    case nir_op_u2f32: {
           /* Asserts a one-dword destination and emits v_cvt_f32_u32 via
            * emit_vop1_instruction. The source bit size is not checked here.
            */
1677       assert(dst.size() == 1);
1678       emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst);
1679       break;
1680    }
1681    case nir_op_u2f64: {
           /* 32-bit source: a single v_cvt_f64_u32.
            * 64-bit source: the value is split into two dwords, both are
            * converted with v_cvt_f64_u32, the upper one is scaled with
            * v_ldexp_f64 by 32, and the two doubles are added into dst.
            * Other source sizes only print a diagnostic.
            */
1682       if (instr->src[0].src.ssa->bit_size == 32) {
1683          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst);
1684       } else if (instr->src[0].src.ssa->bit_size == 64) {
1685          Temp src = get_alu_src(ctx, instr->src[0]);
              /* one-dword class of the same register type as the source */
1686          RegClass rc = RegClass(src.type(), 1);
1687          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1688          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1689          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1690          upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
1691          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1692          bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1693       } else {
1694          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1695          nir_print_instr(&instr->instr, stderr);
1696          fprintf(stderr, "\n");
1697       }
1698       break;
1699    }
1700    case nir_op_f2i32: {
           /* Chooses v_cvt_i32_f32 or v_cvt_i32_f64 from the source bit size.
            * When dst is a VGPR the conversion writes dst directly; otherwise
            * it writes a v1 temporary that is then moved into dst with
            * p_as_uniform. Other source sizes only print a diagnostic.
            */
1701       Temp src = get_alu_src(ctx, instr->src[0]);
1702       if (instr->src[0].src.ssa->bit_size == 32) {
1703          if (dst.type() == RegType::vgpr)
1704             bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
1705          else
1706             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1707                        bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
1708
1709       } else if (instr->src[0].src.ssa->bit_size == 64) {
1710          if (dst.type() == RegType::vgpr)
1711             bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
1712          else
1713             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1714                        bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
1715
1716       } else {
1717          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1718          nir_print_instr(&instr->instr, stderr);
1719          fprintf(stderr, "\n");
1720       }
1721       break;
1722    }
1723    case nir_op_f2u32: {
           /* Chooses v_cvt_u32_f32 or v_cvt_u32_f64 from the source bit size.
            * When dst is a VGPR the conversion writes dst directly; otherwise
            * it writes a v1 temporary that is then moved into dst with
            * p_as_uniform. Other source sizes only print a diagnostic.
            */
1724       Temp src = get_alu_src(ctx, instr->src[0]);
1725       if (instr->src[0].src.ssa->bit_size == 32) {
1726          if (dst.type() == RegType::vgpr)
1727             bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
1728          else
1729             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1730                        bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
1731
1732       } else if (instr->src[0].src.ssa->bit_size == 64) {
1733          if (dst.type() == RegType::vgpr)
1734             bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
1735          else
1736             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1737                        bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
1738
1739       } else {
1740          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1741          nir_print_instr(&instr->instr, stderr);
1742          fprintf(stderr, "\n");
1743       }
1744       break;
1745    }
1746    case nir_op_f2i64: {
           /* Float to signed 64-bit integer. There is no single conversion
            * opcode used here; three hand-written sequences are selected by
            * source bit size and destination register type:
            *   - 32-bit source, VGPR dst: vector sequence
            *   - 32-bit source, SGPR dst: scalar sequence
            *   - 64-bit source:           split into two 32-bit conversions
            * Any other source size only prints a diagnostic.
            */
1747       Temp src = get_alu_src(ctx, instr->src[0]);
1748       if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
              /* exponent from v_frexp_exp_i32_f32, limited to [0, 64] by taking
               * the median of 0, exponent and 64 */
1749          Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1750          exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
              /* low 23 bits of src with bit 23 (0x800000) OR-ed in */
1751          Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
              /* arithmetic shift by 31 replicates the sign bit over the dword */
1752          Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
1753          mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
              /* shift left by 7 and place the result in the second element of a
               * two-dword vector whose first element is 0 */
1754          mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
1755          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
              /* new_exponent = 63 - exponent; the second definition of the
               * subtraction is kept as "borrow" */
1756          Temp new_exponent = bld.tmp(v1);
1757          Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
1758          mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
              /* bit-reversing 0xfffffffe gives 0x7fffffff */
1759          Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
1760          Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1761          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
              /* "borrow" selects between the shifted value and the constants
               * 0xffffffff (lower) / saturate (upper) */
1762          lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
1763          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
              /* XOR with sign followed by a 64-bit subtraction of sign (carried
               * through "borrow"): a two's-complement negation when sign is all
               * ones, a no-op when sign is 0 */
1764          lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
1765          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
1766          Temp new_lower = bld.tmp(v1);
1767          borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
1768          Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
1769          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
1770
1771       } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
              /* Scalar variant of the sequence above; a VGPR source is first
               * made uniform. */
1772          if (src.type() == RegType::vgpr)
1773             src = bld.as_uniform(src);
              /* s_bfe_u32 with 0x80017 (presumably offset 23 / width 8, i.e. the
               * float exponent field -- confirm against the ISA docs), then
               * subtract 126 and limit to [0, 64] with max/min */
1774          Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1775          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1776          exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1777          exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
1778          Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1779          Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
1780          mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1781          mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
1782          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
              /* from here on "exponent" holds 63 - exponent, which is
               * 0xffffffff exactly when the limited exponent was 64 */
1783          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
1784          mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
1785          Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
1786          Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
1787          mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
1788          Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1789          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
              /* conditional negate: XOR with sign, then subtract sign with the
               * borrow passed from the lower to the upper dword */
1790          lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
1791          upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
1792          Temp borrow = bld.tmp(s1);
1793          lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
1794          upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
1795          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1796
1797       } else if (instr->src[0].src.ssa->bit_size == 64) {
              /* {0, 0x3df00000} is the double 2^-32 and {0, 0xc1f00000} is the
               * double -2^32. floor(trunc(src) * 2^-32) yields the upper part;
               * fma(floor, -2^32, trunc) yields what is left for the lower
               * part. The lower part is converted as unsigned, the upper part
               * as signed. For an SGPR dst both halves are made uniform. */
1798          Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1799          Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1800          Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1801          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1802          Temp floor  = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1803          Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1804          Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1805          Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
1806          if (dst.type() == RegType::sgpr) {
1807             lower = bld.as_uniform(lower);
1808             upper = bld.as_uniform(upper);
1809          }
1810          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1811
1812       } else {
1813          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1814          nir_print_instr(&instr->instr, stderr);
1815          fprintf(stderr, "\n");
1816       }
1817       break;
1818    }
1819    case nir_op_f2u64: {
           /* Float to unsigned 64-bit integer. Three hand-written sequences are
            * selected by source bit size and destination register type:
            *   - 32-bit source, VGPR dst: vector sequence
            *   - 32-bit source, SGPR dst: scalar sequence
            *   - 64-bit source:           split into two 32-bit conversions
            * Any other source size only prints a diagnostic.
            */
1820       Temp src = get_alu_src(ctx, instr->src[0]);
1821       if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
1822          Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
              /* range check is done on the raw exponent, before it is clamped
               * to be non-negative */
1823          Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(64u), exponent);
1824          exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
              /* low 23 bits of src with bit 23 (0x800000) OR-ed in */
1825          Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1826          mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
              /* "small" candidate: mantissa shifted right by 24 - exponent */
1827          Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
1828          Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
              /* "large" candidate: mantissa widened to 64 bits (upper dword 0)
               * and shifted left by exponent - 24; the second definition of
               * that subtraction is kept as cond_small */
1829          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1830          Temp new_exponent = bld.tmp(v1);
1831          Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
1832          mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
1833          Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1834          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
              /* cond_small chooses between the two candidates (the upper dword
               * of the small candidate is 0) */
1835          lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
1836          upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
              /* exponent_in_range chooses between that result and 0xffffffff in
               * both dwords */
1837          lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
1838          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
1839          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1840
1841       } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
              /* Scalar variant of the sequence above; a VGPR source is first
               * made uniform. */
1842          if (src.type() == RegType::vgpr)
1843             src = bld.as_uniform(src);
              /* s_bfe_u32 with 0x80017 (presumably offset 23 / width 8, i.e. the
               * float exponent field -- confirm against the ISA docs), then
               * subtract 126 and take the max with 0 */
1844          Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1845          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1846          exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1847          Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1848          mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
              /* "small" candidate: mantissa >> (24 - exponent) */
1849          Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
1850          Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
              /* "large" candidate: 64-bit mantissa << (exponent - 24) */
1851          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1852          Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
1853          mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
              /* range check against 64, then choice between the large
               * candidate and the constant 0xffffffff */
1854          Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
1855          mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
1856          Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1857          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
              /* exponent <= 24 chooses the small candidate (upper dword 0) */
1858          Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
1859          lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
1860          upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
1861          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1862
1863       } else if (instr->src[0].src.ssa->bit_size == 64) {
              /* {0, 0x3df00000} is the double 2^-32 and {0, 0xc1f00000} is the
               * double -2^32. floor(trunc(src) * 2^-32) yields the upper part;
               * fma(floor, -2^32, trunc) yields what is left for the lower
               * part. Both parts are converted as unsigned. For an SGPR dst
               * both halves are made uniform. */
1864          Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1865          Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1866          Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1867          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1868          Temp floor  = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1869          Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1870          Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1871          Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
1872          if (dst.type() == RegType::sgpr) {
1873             lower = bld.as_uniform(lower);
1874             upper = bld.as_uniform(upper);
1875          }
1876          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1877
1878       } else {
1879          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1880          nir_print_instr(&instr->instr, stderr);
1881          fprintf(stderr, "\n");
1882       }
1883       break;
1884    }
1885    case nir_op_b2f32: {
           /* Boolean to 32-bit float; 0x3f800000 is the bit pattern of 1.0f.
            * s1 dst: the source is passed through as_uniform_bool and
            * multiplied by 0x3f800000 with s_mul_i32 (which presumes the
            * uniform bool is 0 or 1 -- confirm against as_uniform_bool).
            * v1 dst: v_cndmask_b32 between 0 and 0x3f800000, controlled by
            * as_divergent_bool(ctx, src, true).
            * Any other destination class is treated as unreachable.
            */
1886       Temp src = get_alu_src(ctx, instr->src[0]);
1887       if (dst.regClass() == s1) {
1888          src = as_uniform_bool(ctx, src);
1889          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
1890       } else if (dst.regClass() == v1) {
1891          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u),
1892                       as_divergent_bool(ctx, src, true));
1893       } else {
1894          unreachable("Wrong destination register class for nir_op_b2f32.");
1895       }
1896       break;
1897    }
1898    case nir_op_b2f64: {
           /* Boolean to 64-bit float.
            * s2 dst: s_cselect_b64 between the operands 0x3f800000 and 0,
            * controlled by the uniform bool in SCC. NOTE(review): 0x3f800000
            * is the 32-bit pattern of 1.0f; this presumably relies on the
            * operand being encoded as the inline constant 1.0 -- confirm.
            * v2 dst: only the upper dword is selected (0 or 0x3FF00000, the
            * upper dword of the double 1.0); the lower dword is always 0.
            * Any other destination class is treated as unreachable.
            */
1899       Temp src = get_alu_src(ctx, instr->src[0]);
1900       if (dst.regClass() == s2) {
1901          src = as_uniform_bool(ctx, src);
1902          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
1903       } else if (dst.regClass() == v2) {
              /* v_mov_b32 writes a single dword, so its result is a v1 temp */
1904          Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1905          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one,
1906                       as_divergent_bool(ctx, src, true));
1907          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1908       } else {
1909          unreachable("Wrong destination register class for nir_op_b2f64.");
1910       }
1911       break;
1912    }
1913    case nir_op_i2i32: {
           /* Only a 64-bit source is handled: element 0 of the source vector is
            * extracted into dst. Other source sizes only print a diagnostic.
            */
1914       Temp src = get_alu_src(ctx, instr->src[0]);
1915       if (instr->src[0].src.ssa->bit_size == 64) {
1916          /* we can actually just say dst = src, as it would map the lower register */
1917          emit_extract_vector(ctx, src, 0, dst);
1918       } else {
1919          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1920          nir_print_instr(&instr->instr, stderr);
1921          fprintf(stderr, "\n");
1922       }
1923       break;
1924    }
1925    case nir_op_u2u32: {
           /* 16-bit source: the value is masked with 0xFFFF, using s_and_b32
            * for an s1 destination and v_and_b32 otherwise.
            * 64-bit source: element 0 of the source vector is extracted.
            * Other source sizes only print a diagnostic.
            */
1926       Temp src = get_alu_src(ctx, instr->src[0]);
1927       if (instr->src[0].src.ssa->bit_size == 16) {
1928          if (dst.regClass() == s1) {
1929             bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
1930          } else {
1931             // TODO: do better with SDWA
1932             bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src);
1933          }
1934       } else if (instr->src[0].src.ssa->bit_size == 64) {
1935          /* we can actually just say dst = src, as it would map the lower register */
1936          emit_extract_vector(ctx, src, 0, dst);
1937       } else {
1938          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1939          nir_print_instr(&instr->instr, stderr);
1940          fprintf(stderr, "\n");
1941       }
1942       break;
1943    }
1944    case nir_op_i2i64: {
           /* Signed 32-bit to 64-bit integer conversion. The upper dword must
            * replicate the sign bit of the source (arithmetic shift right by
            * 31); filling it with 0 would be a zero-extension and give wrong
            * results for negative inputs. The scalar or vector shift is chosen
            * from the register type of the source.
            * Other source sizes only print a diagnostic.
            */
1945       Temp src = get_alu_src(ctx, instr->src[0]);
1946       if (instr->src[0].src.ssa->bit_size == 32) {
              Temp high;
              if (src.type() == RegType::sgpr)
                 high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
              else
                 high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
1947          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, high);
1948       } else {
1949          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1950          nir_print_instr(&instr->instr, stderr);
1951          fprintf(stderr, "\n");
1952       }
1953       break;
1954    }
1955    case nir_op_u2u64: {
           /* Only a 32-bit source is handled: dst is built as the vector
            * (src, 0), i.e. the second element is the constant 0. Other source
            * sizes only print a diagnostic.
            */
1956       Temp src = get_alu_src(ctx, instr->src[0]);
1957       if (instr->src[0].src.ssa->bit_size == 32) {
1958          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
1959       } else {
1960          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1961          nir_print_instr(&instr->instr, stderr);
1962          fprintf(stderr, "\n");
1963       }
1964       break;
1965    }
1966    case nir_op_b2i32: {
           /* Boolean to 32-bit integer.
            * s1 dst, s1 src: plain copy.
            * s1 dst, s2 src: s_cmp_lg_u64 of the source against 0, with dst
            *                 used as the SCC definition of the compare.
            * v1 dst (asserted, together with an s2 src): v_cndmask_b32 between
            *                 0 and 1, controlled by the source.
            */
1967       Temp src = get_alu_src(ctx, instr->src[0]);
1968       if (dst.regClass() == s1) {
1969          if (src.regClass() == s1) {
1970             bld.copy(Definition(dst), src);
1971          } else {
1972             // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
1973             assert(src.regClass() == s2);
1974             bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(Definition(dst)), Operand(0u), src);
1975          }
1976       } else {
1977          assert(dst.regClass() == v1 && src.regClass() == s2);
1978          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
1979       }
1980       break;
1981    }
1982    case nir_op_i2b1: {
           /* Integer to boolean ("not equal to zero").
            * s2 dst: the source is asserted to be v1 or v2 and compared against
            * 0 with v_cmp_lg_u32 / v_cmp_lg_u64 (picked by src.size()); the
            * definition gets a vcc hint.
            * Otherwise: source and dst are asserted to be s1 and s_cmp_lg_u32
            * is used, with dst as the SCC definition of the compare.
            */
1983       Temp src = get_alu_src(ctx, instr->src[0]);
1984       if (dst.regClass() == s2) {
1985          assert(src.regClass() == v1 || src.regClass() == v2);
1986          bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
1987                   Definition(dst), Operand(0u), src).def(0).setHint(vcc);
1988       } else {
1989          assert(src.regClass() == s1 && dst.regClass() == s1);
1990          bld.sopc(aco_opcode::s_cmp_lg_u32, bld.scc(Definition(dst)), Operand(0u), src);
1991       }
1992       break;
1993    }
1994    case nir_op_pack_64_2x32_split: {
           /* Builds dst as the vector (src0, src1) with p_create_vector. */
1995       Temp src0 = get_alu_src(ctx, instr->src[0]);
1996       Temp src1 = get_alu_src(ctx, instr->src[1]);
1997
1998       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
1999       break;
2000    }
2001    case nir_op_unpack_64_2x32_split_x:
           /* Splits the source with p_split_vector; dst is the FIRST definition,
            * the second one is a fresh temporary of dst's class that is not
            * used afterwards in this case. */
2002       bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2003       break;
2004    case nir_op_unpack_64_2x32_split_y:
           /* Splits the source with p_split_vector; dst is the SECOND
            * definition, the first one is a fresh temporary of dst's class that
            * is not used afterwards in this case. */
2005       bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2006       break;
2007    case nir_op_pack_half_2x16: {
           /* Fetches the source through get_alu_src with a third argument of 2
            * (presumably the number of components -- confirm against
            * get_alu_src). Only a v1 destination is handled: the source is
            * split into two v1 values which are fed to v_cvt_pkrtz_f16_f32.
            * Other destination classes only print a diagnostic.
            */
2008       Temp src = get_alu_src(ctx, instr->src[0], 2);
2009
2010       if (dst.regClass() == v1) {
2011          Temp src0 = bld.tmp(v1);
2012          Temp src1 = bld.tmp(v1);
2013          bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2014          bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2015
2016       } else {
2017          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2018          nir_print_instr(&instr->instr, stderr);
2019          fprintf(stderr, "\n");
2020       }
2021       break;
2022    }
2023    case nir_op_unpack_half_2x16_split_x: {
2024       if (dst.regClass() == v1) {
2025          Builder bld(ctx->program, ctx->block);
2026          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2027       } else {
2028          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2029          nir_print_instr(&instr->instr, stderr);
2030          fprintf(stderr, "\n");
2031       }
2032       break;
2033    }
2034    case nir_op_unpack_half_2x16_split_y: {
2035       if (dst.regClass() == v1) {
2036          Builder bld(ctx->program, ctx->block);
2037          /* TODO: use SDWA here */
2038          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2039                   bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2040       } else {
2041          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2042          nir_print_instr(&instr->instr, stderr);
2043          fprintf(stderr, "\n");
2044       }
2045       break;
2046    }
2047    case nir_op_fquantize2f16: {
2048       Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), get_alu_src(ctx, instr->src[0]));
2049
2050       Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */
2051
2052       Temp cmp_res = bld.tmp(s2);
2053       bld.vopc_e64(aco_opcode::v_cmp_class_f16, Definition(cmp_res), f16, mask).def(0).setHint(vcc);
2054
2055       Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2056
2057       bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2058       break;
2059    }
2060    case nir_op_bfm: {
2061       Temp bits = get_alu_src(ctx, instr->src[0]);
2062       Temp offset = get_alu_src(ctx, instr->src[1]);
2063
2064       if (dst.regClass() == s1) {
2065          bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2066       } else if (dst.regClass() == v1) {
2067          bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2068       } else {
2069          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2070          nir_print_instr(&instr->instr, stderr);
2071          fprintf(stderr, "\n");
2072       }
2073       break;
2074    }
2075    case nir_op_bitfield_select: {
2076       /* (mask & insert) | (~mask & base) */
2077       Temp bitmask = get_alu_src(ctx, instr->src[0]);
2078       Temp insert = get_alu_src(ctx, instr->src[1]);
2079       Temp base = get_alu_src(ctx, instr->src[2]);
2080
2081       /* dst = (insert & bitmask) | (base & ~bitmask) */
2082       if (dst.regClass() == s1) {
2083          aco_ptr<Instruction> sop2;
2084          nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2085          nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2086          Operand lhs;
2087          if (const_insert && const_bitmask) {
2088             lhs = Operand(const_insert->u32 & const_bitmask->u32);
2089          } else {
2090             insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2091             lhs = Operand(insert);
2092          }
2093
2094          Operand rhs;
2095          nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2096          if (const_base && const_bitmask) {
2097             rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2098          } else {
2099             base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2100             rhs = Operand(base);
2101          }
2102
2103          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2104
2105       } else if (dst.regClass() == v1) {
2106          if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2107             base = as_vgpr(ctx, base);
2108          if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2109             insert = as_vgpr(ctx, insert);
2110
2111          bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2112
2113       } else {
2114          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2115          nir_print_instr(&instr->instr, stderr);
2116          fprintf(stderr, "\n");
2117       }
2118       break;
2119    }
2120    case nir_op_ubfe:
2121    case nir_op_ibfe: {
2122       Temp base = get_alu_src(ctx, instr->src[0]);
2123       Temp offset = get_alu_src(ctx, instr->src[1]);
2124       Temp bits = get_alu_src(ctx, instr->src[2]);
2125
2126       if (dst.type() == RegType::sgpr) {
2127          Operand extract;
2128          nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2129          nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2130          if (const_offset && const_bits) {
2131             uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2132             extract = Operand(const_extract);
2133          } else {
2134             Operand width;
2135             if (const_bits) {
2136                width = Operand(const_bits->u32 << 16);
2137             } else {
2138                width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2139             }
2140             extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2141          }
2142
2143          aco_opcode opcode;
2144          if (dst.regClass() == s1) {
2145             if (instr->op == nir_op_ubfe)
2146                opcode = aco_opcode::s_bfe_u32;
2147             else
2148                opcode = aco_opcode::s_bfe_i32;
2149          } else if (dst.regClass() == s2) {
2150             if (instr->op == nir_op_ubfe)
2151                opcode = aco_opcode::s_bfe_u64;
2152             else
2153                opcode = aco_opcode::s_bfe_i64;
2154          } else {
2155             unreachable("Unsupported BFE bit size");
2156          }
2157
2158          bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2159
2160       } else {
2161          aco_opcode opcode;
2162          if (dst.regClass() == v1) {
2163             if (instr->op == nir_op_ubfe)
2164                opcode = aco_opcode::v_bfe_u32;
2165             else
2166                opcode = aco_opcode::v_bfe_i32;
2167          } else {
2168             unreachable("Unsupported BFE bit size");
2169          }
2170
2171          emit_vop3a_instruction(ctx, instr, opcode, dst);
2172       }
2173       break;
2174    }
2175    case nir_op_bit_count: {
2176       Temp src = get_alu_src(ctx, instr->src[0]);
2177       if (src.regClass() == s1) {
2178          bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2179       } else if (src.regClass() == v1) {
2180          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2181       } else if (src.regClass() == v2) {
2182          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2183                   emit_extract_vector(ctx, src, 1, v1),
2184                   bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2185                            emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2186       } else if (src.regClass() == s2) {
2187          bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2188       } else {
2189          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2190          nir_print_instr(&instr->instr, stderr);
2191          fprintf(stderr, "\n");
2192       }
2193       break;
2194    }
2195    case nir_op_flt: {
2196       if (instr->src[0].src.ssa->bit_size == 32)
2197          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f32, dst);
2198       else if (instr->src[0].src.ssa->bit_size == 64)
2199          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f64, dst);
2200       break;
2201    }
2202    case nir_op_fge: {
2203       if (instr->src[0].src.ssa->bit_size == 32)
2204          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f32, dst);
2205       else if (instr->src[0].src.ssa->bit_size == 64)
2206          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f64, dst);
2207       break;
2208    }
2209    case nir_op_feq: {
2210       if (instr->src[0].src.ssa->bit_size == 32)
2211          emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f32, dst);
2212       else if (instr->src[0].src.ssa->bit_size == 64)
2213          emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f64, dst);
2214       break;
2215    }
2216    case nir_op_fne: {
2217       if (instr->src[0].src.ssa->bit_size == 32)
2218          emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f32, dst);
2219       else if (instr->src[0].src.ssa->bit_size == 64)
2220          emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f64, dst);
2221       break;
2222    }
2223    case nir_op_ilt: {
2224       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2225          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i32, dst);
2226       else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2227          emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_i32, dst);
2228       else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2229          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i64, dst);
2230       break;
2231    }
2232    case nir_op_ige: {
2233       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2234          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i32, dst);
2235       else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2236          emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_i32, dst);
2237       else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2238          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i64, dst);
2239       break;
2240    }
2241    case nir_op_ieq: {
2242       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
2243          emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i32, dst);
2244       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
2245          emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_i32, dst);
2246       } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
2247          emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i64, dst);
2248       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
2249          emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_u64, dst);
2250       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
2251          Temp src0 = get_alu_src(ctx, instr->src[0]);
2252          Temp src1 = get_alu_src(ctx, instr->src[1]);
2253          bld.sopc(aco_opcode::s_cmp_eq_i32, bld.scc(Definition(dst)),
2254                   as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
2255       } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
2256          Temp src0 = get_alu_src(ctx, instr->src[0]);
2257          Temp src1 = get_alu_src(ctx, instr->src[1]);
2258          bld.sop2(aco_opcode::s_xnor_b64, Definition(dst), bld.def(s1, scc),
2259                   as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
2260       } else {
2261          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2262          nir_print_instr(&instr->instr, stderr);
2263          fprintf(stderr, "\n");
2264       }
2265       break;
2266    }
2267    case nir_op_ine: {
2268       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
2269          emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i32, dst);
2270       } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
2271          emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i64, dst);
2272       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
2273          emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_i32, dst);
2274       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
2275          emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_u64, dst);
2276       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
2277          Temp src0 = get_alu_src(ctx, instr->src[0]);
2278          Temp src1 = get_alu_src(ctx, instr->src[1]);
2279          bld.sopc(aco_opcode::s_cmp_lg_i32, bld.scc(Definition(dst)),
2280                   as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
2281       } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
2282          Temp src0 = get_alu_src(ctx, instr->src[0]);
2283          Temp src1 = get_alu_src(ctx, instr->src[1]);
2284          bld.sop2(aco_opcode::s_xor_b64, Definition(dst), bld.def(s1, scc),
2285                   as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
2286       } else {
2287          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2288          nir_print_instr(&instr->instr, stderr);
2289          fprintf(stderr, "\n");
2290       }
2291       break;
2292    }
2293    case nir_op_ult: {
2294       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2295          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u32, dst);
2296       else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2297          emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_u32, dst);
2298       else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2299          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u64, dst);
2300       break;
2301    }
2302    case nir_op_uge: {
2303       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2304          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u32, dst);
2305       else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2306          emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_u32, dst);
2307       else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2308          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u64, dst);
2309       break;
2310    }
2311    case nir_op_fddx:
2312    case nir_op_fddy:
2313    case nir_op_fddx_fine:
2314    case nir_op_fddy_fine:
2315    case nir_op_fddx_coarse:
2316    case nir_op_fddy_coarse: {
2317       Definition tl = bld.def(v1);
2318       uint16_t dpp_ctrl;
2319       if (instr->op == nir_op_fddx_fine) {
2320          bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 2, 2));
2321          dpp_ctrl = dpp_quad_perm(1, 1, 3, 3);
2322       } else if (instr->op == nir_op_fddy_fine) {
2323          bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 1, 0, 1));
2324          dpp_ctrl = dpp_quad_perm(2, 3, 2, 3);
2325       } else {
2326          bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 0, 0));
2327          if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
2328             dpp_ctrl = dpp_quad_perm(1, 1, 1, 1);
2329          else
2330             dpp_ctrl = dpp_quad_perm(2, 2, 2, 2);
2331       }
2332
2333       Definition tmp = bld.def(v1);
2334       bld.vop2_dpp(aco_opcode::v_sub_f32, tmp, get_alu_src(ctx, instr->src[0]), tl.getTemp(), dpp_ctrl);
2335       emit_wqm(ctx, tmp.getTemp(), dst, true);
2336       break;
2337    }
2338    default:
2339       fprintf(stderr, "Unknown NIR ALU instr: ");
2340       nir_print_instr(&instr->instr, stderr);
2341       fprintf(stderr, "\n");
2342    }
2343 }
2344
2345 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
2346 {
2347    Temp dst = get_ssa_temp(ctx, &instr->def);
2348
2349    // TODO: we really want to have the resulting type as this would allow for 64bit literals
2350    // which get truncated the lsb if double and msb if int
2351    // for now, we only use s_mov_b64 with 64bit inline constants
2352    assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
2353    assert(dst.type() == RegType::sgpr);
2354
2355    if (dst.size() == 1)
2356    {
2357       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(instr->value[0].u32));
2358    } else {
2359       assert(dst.size() != 1);
2360       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2361       if (instr->def.bit_size == 64)
2362          for (unsigned i = 0; i < dst.size(); i++)
2363             vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
2364       else {
2365          for (unsigned i = 0; i < dst.size(); i++)
2366             vec->operands[i] = Operand{instr->value[i].u32};
2367       }
2368       vec->definitions[0] = Definition(dst);
2369       ctx->block->instructions.emplace_back(std::move(vec));
2370    }
2371 }
2372
/* Expands every set bit i of `mask` into a run of `multiplier` consecutive
 * set bits starting at bit i * multiplier (e.g. 0b101 with multiplier 2
 * becomes 0b110011).
 *
 * Bits that would land at or above bit 32 are dropped. The arithmetic is done
 * in 64 bits and the loop stops once a group would start outside the 32-bit
 * result, so no shift count ever reaches the width of the shifted type (which
 * would be undefined behaviour). A multiplier of 0 yields 0.
 */
uint32_t widen_mask(uint32_t mask, unsigned multiplier)
{
   if (multiplier == 0)
      return 0;

   /* `multiplier` low bits set, clamped to the 32-bit result width. */
   const uint64_t group = multiplier >= 32 ? 0xffffffffull : ((1ull << multiplier) - 1ull);

   uint64_t new_mask = 0;
   /* i * multiplier < 32 also implies i < 32, so `mask >> i` is well defined. */
   for (unsigned i = 0; i * multiplier < 32 && (mask >> i) != 0; ++i) {
      if (mask & (1u << i))
         new_mask |= group << (i * multiplier);
   }
   return (uint32_t)new_mask;
}
2381
2382 void visit_store_vs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2383 {
2384    /* This wouldn't work inside control flow or with indirect offsets but
2385     * that doesn't happen because of nir_lower_io_to_temporaries(). */
2386
2387    unsigned write_mask = nir_intrinsic_write_mask(instr);
2388    unsigned component = nir_intrinsic_component(instr);
2389    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2390    unsigned idx = nir_intrinsic_base(instr) + component;
2391
2392    nir_instr *off_instr = instr->src[1].ssa->parent_instr;
2393    if (off_instr->type != nir_instr_type_load_const) {
2394       fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2395       nir_print_instr(off_instr, stderr);
2396       fprintf(stderr, "\n");
2397    }
2398    idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
2399
2400    if (instr->src[0].ssa->bit_size == 64)
2401       write_mask = widen_mask(write_mask, 2);
2402
2403    for (unsigned i = 0; i < 8; ++i) {
2404       if (write_mask & (1 << i)) {
2405          ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u);
2406          ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
2407       }
2408       idx++;
2409    }
2410 }
2411
2412 void visit_store_fs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2413 {
2414    unsigned write_mask = nir_intrinsic_write_mask(instr);
2415    Operand values[4];
2416    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2417    for (unsigned i = 0; i < 4; ++i) {
2418       if (write_mask & (1 << i)) {
2419          Temp tmp = emit_extract_vector(ctx, src, i, v1);
2420          values[i] = Operand(tmp);
2421       } else {
2422          values[i] = Operand(v1);
2423       }
2424    }
2425
2426    unsigned index = nir_intrinsic_base(instr) / 4;
2427    unsigned target, col_format;
2428    unsigned enabled_channels = 0xF;
2429    aco_opcode compr_op = (aco_opcode)0;
2430
2431    nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2432    assert(offset && "Non-const offsets on exports not yet supported");
2433    index += offset->u32;
2434
2435    assert(index != FRAG_RESULT_COLOR);
2436
2437    /* Unlike vertex shader exports, it's fine to use multiple exports to
2438     * export separate channels of one target. So shaders which export both
2439     * FRAG_RESULT_SAMPLE_MASK and FRAG_RESULT_DEPTH should work fine.
2440     * TODO: combine the exports in those cases and create better code
2441     */
2442
2443    if (index == FRAG_RESULT_SAMPLE_MASK) {
2444
2445       if (ctx->program->info->ps.writes_z) {
2446          target = V_008DFC_SQ_EXP_MRTZ;
2447          enabled_channels = 0x4;
2448          col_format = (unsigned) -1;
2449
2450          values[2] = values[0];
2451          values[0] = Operand(v1);
2452       } else {
2453          aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2454          exp->valid_mask = false;
2455          exp->done = false;
2456          exp->compressed = true;
2457          exp->dest = V_008DFC_SQ_EXP_MRTZ;
2458          exp->enabled_mask = 0xc;
2459          for (int i = 0; i < 4; i++)
2460             exp->operands[i] = Operand(v1);
2461          exp->operands[1] = Operand(values[0]);
2462          ctx->block->instructions.emplace_back(std::move(exp));
2463          return;
2464       }
2465
2466    } else if (index == FRAG_RESULT_DEPTH) {
2467
2468       target = V_008DFC_SQ_EXP_MRTZ;
2469       enabled_channels = 0x1;
2470       col_format = (unsigned) -1;
2471
2472    } else if (index == FRAG_RESULT_STENCIL) {
2473
2474       if (ctx->program->info->ps.writes_z) {
2475          target = V_008DFC_SQ_EXP_MRTZ;
2476          enabled_channels = 0x2;
2477          col_format = (unsigned) -1;
2478
2479          values[1] = values[0];
2480          values[0] = Operand(v1);
2481       } else {
2482          aco_ptr<Instruction> shift{create_instruction<VOP2_instruction>(aco_opcode::v_lshlrev_b32, Format::VOP2, 2, 1)};
2483          shift->operands[0] = Operand((uint32_t) 16);
2484          shift->operands[1] = values[0];
2485          Temp tmp = {ctx->program->allocateId(), v1};
2486          shift->definitions[0] = Definition(tmp);
2487          ctx->block->instructions.emplace_back(std::move(shift));
2488
2489          aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2490          exp->valid_mask = false;
2491          exp->done = false;
2492          exp->compressed = true;
2493          exp->dest = V_008DFC_SQ_EXP_MRTZ;
2494          exp->enabled_mask = 0x3;
2495          exp->operands[0] = Operand(tmp);
2496          for (int i = 1; i < 4; i++)
2497             exp->operands[i] = Operand(v1);
2498          ctx->block->instructions.emplace_back(std::move(exp));
2499          return;
2500       }
2501
2502    } else {
2503       index -= FRAG_RESULT_DATA0;
2504       target = V_008DFC_SQ_EXP_MRT + index;
2505       col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
2506    }
2507    ASSERTED bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
2508    ASSERTED bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1;
2509    assert(!is_int8 && !is_int10);
2510
2511    switch (col_format)
2512    {
2513    case V_028714_SPI_SHADER_ZERO:
2514       enabled_channels = 0; /* writemask */
2515       target = V_008DFC_SQ_EXP_NULL;
2516       break;
2517
2518    case V_028714_SPI_SHADER_32_R:
2519       enabled_channels = 1;
2520       break;
2521
2522    case V_028714_SPI_SHADER_32_GR:
2523       enabled_channels = 0x3;
2524       break;
2525
2526    case V_028714_SPI_SHADER_32_AR:
2527       enabled_channels = 0x9;
2528       break;
2529
2530    case V_028714_SPI_SHADER_FP16_ABGR:
2531       enabled_channels = 0x5;
2532       compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
2533       break;
2534
2535    case V_028714_SPI_SHADER_UNORM16_ABGR:
2536       enabled_channels = 0x5;
2537       compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
2538       break;
2539
2540    case V_028714_SPI_SHADER_SNORM16_ABGR:
2541       enabled_channels = 0x5;
2542       compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
2543       break;
2544
2545    case V_028714_SPI_SHADER_UINT16_ABGR:
2546       enabled_channels = 0x5;
2547       compr_op = aco_opcode::v_cvt_pk_u16_u32;
2548       break;
2549
2550    case V_028714_SPI_SHADER_SINT16_ABGR:
2551       enabled_channels = 0x5;
2552       compr_op = aco_opcode::v_cvt_pk_i16_i32;
2553       break;
2554
2555    case V_028714_SPI_SHADER_32_ABGR:
2556       enabled_channels = 0xF;
2557       break;
2558
2559    default:
2560       break;
2561    }
2562
2563    if (target == V_008DFC_SQ_EXP_NULL)
2564       return;
2565
2566    if ((bool)compr_op)
2567    {
2568       for (int i = 0; i < 2; i++)
2569       {
2570          /* check if at least one of the values to be compressed is enabled */
2571          unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
2572          if (enabled) {
2573             enabled_channels |= enabled << (i*2);
2574             aco_ptr<VOP3A_instruction> compr{create_instruction<VOP3A_instruction>(compr_op, Format::VOP3A, 2, 1)};
2575             Temp tmp{ctx->program->allocateId(), v1};
2576             compr->operands[0] = values[i*2].isUndefined() ? Operand(0u) : values[i*2];
2577             compr->operands[1] = values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1];
2578             compr->definitions[0] = Definition(tmp);
2579             values[i] = Operand(tmp);
2580             ctx->block->instructions.emplace_back(std::move(compr));
2581          } else {
2582             values[i] = Operand(v1);
2583          }
2584       }
2585    }
2586
2587    aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2588    exp->valid_mask = false;
2589    exp->done = false;
2590    exp->compressed = (bool) compr_op;
2591    exp->dest = target;
2592    exp->enabled_mask = enabled_channels;
2593    if ((bool) compr_op) {
2594       for (int i = 0; i < 2; i++)
2595          exp->operands[i] = enabled_channels & (3 << (i * 2)) ? values[i] : Operand(v1);
2596       exp->operands[2] = Operand(v1);
2597       exp->operands[3] = Operand(v1);
2598    } else {
2599       for (int i = 0; i < 4; i++)
2600          exp->operands[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
2601    }
2602
2603    ctx->block->instructions.emplace_back(std::move(exp));
2604 }
2605
2606 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
2607 {
2608    if (ctx->stage == vertex_vs) {
2609       visit_store_vs_output(ctx, instr);
2610    } else if (ctx->stage == fragment_fs) {
2611       visit_store_fs_output(ctx, instr);
2612    } else {
2613       unreachable("Shader stage not implemented");
2614    }
2615 }
2616
2617 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
2618 {
2619    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
2620    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
2621
2622    Builder bld(ctx->program, ctx->block);
2623    Temp tmp = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
2624    bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), tmp, idx, component);
2625 }
2626
2627 void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
2628 {
2629    aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
2630    for (unsigned i = 0; i < num_components; i++)
2631       vec->operands[i] = Operand(ctx->fs_inputs[fs_input::frag_pos_0 + i]);
2632
2633    if (ctx->fs_vgpr_args[fs_input::frag_pos_3]) {
2634       assert(num_components == 4);
2635       Builder bld(ctx->program, ctx->block);
2636       vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ctx->fs_inputs[fs_input::frag_pos_3]);
2637    }
2638
2639    for (Operand& op : vec->operands)
2640       op = op.isUndefined() ? Operand(0u) : op;
2641
2642    vec->definitions[0] = Definition(dst);
2643    ctx->block->instructions.emplace_back(std::move(vec));
2644    emit_split_vector(ctx, dst, num_components);
2645    return;
2646 }
2647
2648 void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
2649 {
2650    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2651    Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
2652    unsigned idx = nir_intrinsic_base(instr);
2653    unsigned component = nir_intrinsic_component(instr);
2654    Temp prim_mask = ctx->prim_mask;
2655
2656    nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2657    if (offset) {
2658       assert(offset->u32 == 0);
2659    } else {
2660       /* the lower 15bit of the prim_mask contain the offset into LDS
2661        * while the upper bits contain the number of prims */
2662       Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
2663       assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
2664       Builder bld(ctx->program, ctx->block);
2665       Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
2666       stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
2667       stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
2668       offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
2669       prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
2670    }
2671
2672    if (instr->dest.ssa.num_components == 1) {
2673       emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
2674    } else {
2675       aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
2676       for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
2677       {
2678          Temp tmp = {ctx->program->allocateId(), v1};
2679          emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
2680          vec->operands[i] = Operand(tmp);
2681       }
2682       vec->definitions[0] = Definition(dst);
2683       ctx->block->instructions.emplace_back(std::move(vec));
2684    }
2685 }
2686
2687 unsigned get_num_channels_from_data_format(unsigned data_format)
2688 {
2689    switch (data_format) {
2690    case V_008F0C_BUF_DATA_FORMAT_8:
2691    case V_008F0C_BUF_DATA_FORMAT_16:
2692    case V_008F0C_BUF_DATA_FORMAT_32:
2693       return 1;
2694    case V_008F0C_BUF_DATA_FORMAT_8_8:
2695    case V_008F0C_BUF_DATA_FORMAT_16_16:
2696    case V_008F0C_BUF_DATA_FORMAT_32_32:
2697       return 2;
2698    case V_008F0C_BUF_DATA_FORMAT_10_11_11:
2699    case V_008F0C_BUF_DATA_FORMAT_11_11_10:
2700    case V_008F0C_BUF_DATA_FORMAT_32_32_32:
2701       return 3;
2702    case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
2703    case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
2704    case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
2705    case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
2706    case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
2707       return 4;
2708    default:
2709       break;
2710    }
2711
2712    return 4;
2713 }
2714
2715 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
2716  * so we may need to fix it up. */
2717 Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
2718 {
2719    Builder bld(ctx->program, ctx->block);
2720
2721    if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
2722       alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
2723
2724    /* For the integer-like cases, do a natural sign extension.
2725     *
2726     * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
2727     * and happen to contain 0, 1, 2, 3 as the two LSBs of the
2728     * exponent.
2729     */
2730    alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
2731    alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
2732
2733    /* Convert back to the right type. */
2734    if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
2735       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
2736       Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0xbf800000u), alpha);
2737       alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
2738    } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
2739       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
2740    }
2741
2742    return alpha;
2743 }
2744
2745 void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
2746 {
2747    Builder bld(ctx->program, ctx->block);
2748    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2749    if (ctx->stage & sw_vs) {
2750
2751       nir_instr *off_instr = instr->src[0].ssa->parent_instr;
2752       if (off_instr->type != nir_instr_type_load_const) {
2753          fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2754          nir_print_instr(off_instr, stderr);
2755          fprintf(stderr, "\n");
2756       }
2757       uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
2758
2759       Temp vertex_buffers = convert_pointer_to_64_bit(ctx, ctx->vertex_buffers);
2760
2761       unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
2762       unsigned component = nir_intrinsic_component(instr);
2763       unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
2764       uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
2765       uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
2766       unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
2767
2768       unsigned dfmt = attrib_format & 0xf;
2769
2770       unsigned nfmt = (attrib_format >> 4) & 0x7;
2771       unsigned num_dfmt_channels = get_num_channels_from_data_format(dfmt);
2772       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
2773       unsigned num_channels = MIN2(util_last_bit(mask), num_dfmt_channels);
2774       unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
2775       bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
2776       if (post_shuffle)
2777          num_channels = MAX2(num_channels, 3);
2778
2779       Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, Operand(attrib_binding * 16u));
2780
2781       Temp index;
2782       if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
2783          uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
2784          if (divisor) {
2785             ctx->needs_instance_id = true;
2786
2787             if (divisor != 1) {
2788                Temp divided = bld.tmp(v1);
2789                emit_v_div_u32(ctx, divided, as_vgpr(ctx, ctx->instance_id), divisor);
2790                index = bld.vadd32(bld.def(v1), ctx->start_instance, divided);
2791             } else {
2792                index = bld.vadd32(bld.def(v1), ctx->start_instance, ctx->instance_id);
2793             }
2794          } else {
2795             index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), ctx->start_instance);
2796          }
2797       } else {
2798          index = bld.vadd32(bld.def(v1), ctx->base_vertex, ctx->vertex_id);
2799       }
2800
2801       if (attrib_stride != 0 && attrib_offset > attrib_stride) {
2802          index = bld.vadd32(bld.def(v1), Operand(attrib_offset / attrib_stride), index);
2803          attrib_offset = attrib_offset % attrib_stride;
2804       }
2805
2806       Operand soffset(0u);
2807       if (attrib_offset >= 4096) {
2808          soffset = bld.copy(bld.def(s1), Operand(attrib_offset));
2809          attrib_offset = 0;
2810       }
2811
2812       aco_opcode opcode;
2813       switch (num_channels) {
2814       case 1:
2815          opcode = aco_opcode::tbuffer_load_format_x;
2816          break;
2817       case 2:
2818          opcode = aco_opcode::tbuffer_load_format_xy;
2819          break;
2820       case 3:
2821          opcode = aco_opcode::tbuffer_load_format_xyz;
2822          break;
2823       case 4:
2824          opcode = aco_opcode::tbuffer_load_format_xyzw;
2825          break;
2826       default:
2827          unreachable("Unimplemented load_input vector size");
2828       }
2829
2830       Temp tmp = post_shuffle || num_channels != dst.size() || alpha_adjust != RADV_ALPHA_ADJUST_NONE || component ? bld.tmp(RegType::vgpr, num_channels) : dst;
2831
2832       aco_ptr<MTBUF_instruction> mubuf{create_instruction<MTBUF_instruction>(opcode, Format::MTBUF, 3, 1)};
2833       mubuf->operands[0] = Operand(index);
2834       mubuf->operands[1] = Operand(list);
2835       mubuf->operands[2] = soffset;
2836       mubuf->definitions[0] = Definition(tmp);
2837       mubuf->idxen = true;
2838       mubuf->can_reorder = true;
2839       mubuf->dfmt = dfmt;
2840       mubuf->nfmt = nfmt;
2841       assert(attrib_offset < 4096);
2842       mubuf->offset = attrib_offset;
2843       ctx->block->instructions.emplace_back(std::move(mubuf));
2844
2845       emit_split_vector(ctx, tmp, tmp.size());
2846
2847       if (tmp.id() != dst.id()) {
2848          bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
2849                          nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
2850
2851          static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
2852          static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
2853          const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
2854
2855          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2856          for (unsigned i = 0; i < dst.size(); i++) {
2857             unsigned idx = i + component;
2858             if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE && num_channels >= 4) {
2859                Temp alpha = emit_extract_vector(ctx, tmp, swizzle[3], v1);
2860                vec->operands[3] = Operand(adjust_vertex_fetch_alpha(ctx, alpha_adjust, alpha));
2861             } else if (idx < num_channels) {
2862                vec->operands[i] = Operand(emit_extract_vector(ctx, tmp, swizzle[idx], v1));
2863             } else if (is_float && idx == 3) {
2864                vec->operands[i] = Operand(0x3f800000u);
2865             } else if (!is_float && idx == 3) {
2866                vec->operands[i] = Operand(1u);
2867             } else {
2868                vec->operands[i] = Operand(0u);
2869             }
2870          }
2871          vec->definitions[0] = Definition(dst);
2872          ctx->block->instructions.emplace_back(std::move(vec));
2873          emit_split_vector(ctx, dst, dst.size());
2874       }
2875
2876    } else if (ctx->stage == fragment_fs) {
2877       nir_instr *off_instr = instr->src[0].ssa->parent_instr;
2878       if (off_instr->type != nir_instr_type_load_const ||
2879           nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
2880          fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2881          nir_print_instr(off_instr, stderr);
2882          fprintf(stderr, "\n");
2883       }
2884
2885       Temp prim_mask = ctx->prim_mask;
2886       nir_const_value* offset = nir_src_as_const_value(instr->src[0]);
2887       if (offset) {
2888          assert(offset->u32 == 0);
2889       } else {
2890          /* the lower 15bit of the prim_mask contain the offset into LDS
2891           * while the upper bits contain the number of prims */
2892          Temp offset_src = get_ssa_temp(ctx, instr->src[0].ssa);
2893          assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
2894          Builder bld(ctx->program, ctx->block);
2895          Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
2896          stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
2897          stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
2898          offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
2899          prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
2900       }
2901
2902       unsigned idx = nir_intrinsic_base(instr);
2903       unsigned component = nir_intrinsic_component(instr);
2904
2905       if (dst.size() == 1) {
2906          bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(2u), bld.m0(prim_mask), idx, component);
2907       } else {
2908          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2909          for (unsigned i = 0; i < dst.size(); i++)
2910             vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(2u), bld.m0(prim_mask), idx, component + i);
2911          vec->definitions[0] = Definition(dst);
2912          bld.insert(std::move(vec));
2913       }
2914
2915    } else {
2916       unreachable("Shader stage not implemented");
2917    }
2918 }
2919
2920 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
2921 {
2922    if (ctx->program->info->need_indirect_descriptor_sets) {
2923       Builder bld(ctx->program, ctx->block);
2924       Temp ptr64 = convert_pointer_to_64_bit(ctx, ctx->descriptor_sets[0]);
2925       return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, Operand(desc_set << 2));//, false, false, false);
2926    }
2927
2928    return ctx->descriptor_sets[desc_set];
2929 }
2930
2931
2932 void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
2933 {
2934    Builder bld(ctx->program, ctx->block);
2935    Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
2936    unsigned desc_set = nir_intrinsic_desc_set(instr);
2937    unsigned binding = nir_intrinsic_binding(instr);
2938
2939    Temp desc_ptr;
2940    radv_pipeline_layout *pipeline_layout = ctx->options->layout;
2941    radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
2942    unsigned offset = layout->binding[binding].offset;
2943    unsigned stride;
2944    if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
2945        layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
2946       unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
2947       desc_ptr = ctx->push_constants;
2948       offset = pipeline_layout->push_constant_size + 16 * idx;
2949       stride = 16;
2950    } else {
2951       desc_ptr = load_desc_ptr(ctx, desc_set);
2952       stride = layout->binding[binding].size;
2953    }
2954
2955    nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
2956    unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
2957    if (stride != 1) {
2958       if (nir_const_index) {
2959          const_index = const_index * stride;
2960       } else {
2961          index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
2962       }
2963    }
2964    if (offset) {
2965       if (nir_const_index) {
2966          const_index = const_index + offset;
2967       } else {
2968          index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
2969       }
2970    }
2971
2972    if (nir_const_index && const_index == 0) {
2973       index = desc_ptr;
2974    } else {
2975       index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
2976                        nir_const_index ? Operand(const_index) : Operand(index),
2977                        Operand(desc_ptr));
2978    }
2979
2980    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2981    bld.sop1(aco_opcode::s_mov_b32, Definition(dst), index);
2982 }
2983
2984 void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, Temp rsrc, Temp offset, bool glc=false)
2985 {
2986    Builder bld(ctx->program, ctx->block);
2987
2988    unsigned num_bytes = dst.size() * 4;
2989    bool dlc = glc && ctx->options->chip_class >= GFX10;
2990
2991    aco_opcode op;
2992    if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
2993       if (ctx->options->chip_class < GFX8)
2994          offset = as_vgpr(ctx, offset);
2995
2996       Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
2997       Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
2998       unsigned const_offset = 0;
2999
3000       Temp lower = Temp();
3001       if (num_bytes > 16) {
3002          assert(num_components == 3 || num_components == 4);
3003          op = aco_opcode::buffer_load_dwordx4;
3004          lower = bld.tmp(v4);
3005          aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3006          mubuf->definitions[0] = Definition(lower);
3007          mubuf->operands[0] = vaddr;
3008          mubuf->operands[1] = Operand(rsrc);
3009          mubuf->operands[2] = soffset;
3010          mubuf->offen = (offset.type() == RegType::vgpr);
3011          mubuf->glc = glc;
3012          mubuf->dlc = dlc;
3013          mubuf->barrier = barrier_buffer;
3014          bld.insert(std::move(mubuf));
3015          emit_split_vector(ctx, lower, 2);
3016          num_bytes -= 16;
3017          const_offset = 16;
3018       }
3019
3020       switch (num_bytes) {
3021          case 4:
3022             op = aco_opcode::buffer_load_dword;
3023             break;
3024          case 8:
3025             op = aco_opcode::buffer_load_dwordx2;
3026             break;
3027          case 12:
3028             op = aco_opcode::buffer_load_dwordx3;
3029             break;
3030          case 16:
3031             op = aco_opcode::buffer_load_dwordx4;
3032             break;
3033          default:
3034             unreachable("Load SSBO not implemented for this size.");
3035       }
3036       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3037       mubuf->operands[0] = vaddr;
3038       mubuf->operands[1] = Operand(rsrc);
3039       mubuf->operands[2] = soffset;
3040       mubuf->offen = (offset.type() == RegType::vgpr);
3041       mubuf->glc = glc;
3042       mubuf->dlc = dlc;
3043       mubuf->barrier = barrier_buffer;
3044       mubuf->offset = const_offset;
3045       aco_ptr<Instruction> instr = std::move(mubuf);
3046
3047       if (dst.size() > 4) {
3048          assert(lower != Temp());
3049          Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size());
3050          instr->definitions[0] = Definition(upper);
3051          bld.insert(std::move(instr));
3052          if (dst.size() == 8)
3053             emit_split_vector(ctx, upper, 2);
3054          instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1));
3055          instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2));
3056          instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2));
3057          instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2));
3058          if (dst.size() == 8)
3059             instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2));
3060       }
3061
3062       if (dst.type() == RegType::sgpr) {
3063          Temp vec = bld.tmp(RegType::vgpr, dst.size());
3064          instr->definitions[0] = Definition(vec);
3065          bld.insert(std::move(instr));
3066          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
3067       } else {
3068          instr->definitions[0] = Definition(dst);
3069          bld.insert(std::move(instr));
3070       }
3071    } else {
3072       switch (num_bytes) {
3073          case 4:
3074             op = aco_opcode::s_buffer_load_dword;
3075             break;
3076          case 8:
3077             op = aco_opcode::s_buffer_load_dwordx2;
3078             break;
3079          case 12:
3080          case 16:
3081             op = aco_opcode::s_buffer_load_dwordx4;
3082             break;
3083          case 24:
3084          case 32:
3085             op = aco_opcode::s_buffer_load_dwordx8;
3086             break;
3087          default:
3088             unreachable("Load SSBO not implemented for this size.");
3089       }
3090       aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
3091       load->operands[0] = Operand(rsrc);
3092       load->operands[1] = Operand(bld.as_uniform(offset));
3093       assert(load->operands[1].getTemp().type() == RegType::sgpr);
3094       load->definitions[0] = Definition(dst);
3095       load->glc = glc;
3096       load->dlc = dlc;
3097       load->barrier = barrier_buffer;
3098       assert(ctx->options->chip_class >= GFX8 || !glc);
3099
3100       /* trim vector */
3101       if (dst.size() == 3) {
3102          Temp vec = bld.tmp(s4);
3103          load->definitions[0] = Definition(vec);
3104          bld.insert(std::move(load));
3105          emit_split_vector(ctx, vec, 4);
3106
3107          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3108                     emit_extract_vector(ctx, vec, 0, s1),
3109                     emit_extract_vector(ctx, vec, 1, s1),
3110                     emit_extract_vector(ctx, vec, 2, s1));
3111       } else if (dst.size() == 6) {
3112          Temp vec = bld.tmp(s8);
3113          load->definitions[0] = Definition(vec);
3114          bld.insert(std::move(load));
3115          emit_split_vector(ctx, vec, 4);
3116
3117          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3118                     emit_extract_vector(ctx, vec, 0, s2),
3119                     emit_extract_vector(ctx, vec, 1, s2),
3120                     emit_extract_vector(ctx, vec, 2, s2));
3121       } else {
3122          bld.insert(std::move(load));
3123       }
3124
3125    }
3126    emit_split_vector(ctx, dst, num_components);
3127 }
3128
3129 void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
3130 {
3131    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3132    Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
3133
3134    Builder bld(ctx->program, ctx->block);
3135
3136    nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3137    unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
3138    unsigned binding = nir_intrinsic_binding(idx_instr);
3139    radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
3140
3141    if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
3142       uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3143                            S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3144                            S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3145                            S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3146       if (ctx->options->chip_class >= GFX10) {
3147          desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3148                       S_008F0C_OOB_SELECT(3) |
3149                       S_008F0C_RESOURCE_LEVEL(1);
3150       } else {
3151          desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3152                       S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3153       }
3154       Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
3155                                      Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
3156                                      Operand(0xFFFFFFFFu),
3157                                      Operand(desc_type));
3158       rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3159                         rsrc, upper_dwords);
3160    } else {
3161       rsrc = convert_pointer_to_64_bit(ctx, rsrc);
3162       rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
3163    }
3164
3165    load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa));
3166 }
3167
3168 void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3169 {
3170    Builder bld(ctx->program, ctx->block);
3171    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3172
3173    unsigned offset = nir_intrinsic_base(instr);
3174    nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
3175    if (index_cv && instr->dest.ssa.bit_size == 32) {
3176
3177       unsigned count = instr->dest.ssa.num_components;
3178       unsigned start = (offset + index_cv->u32) / 4u;
3179       start -= ctx->base_inline_push_consts;
3180       if (start + count <= ctx->num_inline_push_consts) {
3181          std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
3182          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
3183          for (unsigned i = 0; i < count; ++i) {
3184             elems[i] = ctx->inline_push_consts[start + i];
3185             vec->operands[i] = Operand{elems[i]};
3186          }
3187          vec->definitions[0] = Definition(dst);
3188          ctx->block->instructions.emplace_back(std::move(vec));
3189          ctx->allocated_vec.emplace(dst.id(), elems);
3190          return;
3191       }
3192    }
3193
3194    Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
3195    if (offset != 0) // TODO check if index != 0 as well
3196       index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
3197    Temp ptr = convert_pointer_to_64_bit(ctx, ctx->push_constants);
3198    Temp vec = dst;
3199    bool trim = false;
3200    aco_opcode op;
3201
3202    switch (dst.size()) {
3203    case 1:
3204       op = aco_opcode::s_load_dword;
3205       break;
3206    case 2:
3207       op = aco_opcode::s_load_dwordx2;
3208       break;
3209    case 3:
3210       vec = bld.tmp(s4);
3211       trim = true;
3212    case 4:
3213       op = aco_opcode::s_load_dwordx4;
3214       break;
3215    case 6:
3216       vec = bld.tmp(s8);
3217       trim = true;
3218    case 8:
3219       op = aco_opcode::s_load_dwordx8;
3220       break;
3221    default:
3222       unreachable("unimplemented or forbidden load_push_constant.");
3223    }
3224
3225    bld.smem(op, Definition(vec), ptr, index);
3226
3227    if (trim) {
3228       emit_split_vector(ctx, vec, 4);
3229       RegClass rc = dst.size() == 3 ? s1 : s2;
3230       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3231                  emit_extract_vector(ctx, vec, 0, rc),
3232                  emit_extract_vector(ctx, vec, 1, rc),
3233                  emit_extract_vector(ctx, vec, 2, rc));
3234
3235    }
3236    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
3237 }
3238
3239 void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3240 {
3241    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3242
3243    Builder bld(ctx->program, ctx->block);
3244
3245    uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3246                         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3247                         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3248                         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3249    if (ctx->options->chip_class >= GFX10) {
3250       desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3251                    S_008F0C_OOB_SELECT(3) |
3252                    S_008F0C_RESOURCE_LEVEL(1);
3253    } else {
3254       desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3255                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3256    }
3257
3258    unsigned base = nir_intrinsic_base(instr);
3259    unsigned range = nir_intrinsic_range(instr);
3260
3261    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
3262    if (base && offset.type() == RegType::sgpr)
3263       offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
3264    else if (base && offset.type() == RegType::vgpr)
3265       offset = bld.vadd32(bld.def(v1), Operand(base), offset);
3266
3267    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3268                           bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
3269                           Operand(MIN2(base + range, ctx->shader->constant_data_size)),
3270                           Operand(desc_type));
3271
3272    load_buffer(ctx, instr->num_components, dst, rsrc, offset);
3273 }
3274
3275 void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
3276 {
3277    if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3278       ctx->cf_info.exec_potentially_empty = true;
3279
3280    ctx->program->needs_exact = true;
3281
3282    // TODO: optimize uniform conditions
3283    Builder bld(ctx->program, ctx->block);
3284    Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
3285    src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
3286    bld.pseudo(aco_opcode::p_discard_if, src);
3287    ctx->block->kind |= block_kind_uses_discard_if;
3288    return;
3289 }
3290
3291 void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
3292 {
3293    Builder bld(ctx->program, ctx->block);
3294
3295    if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3296       ctx->cf_info.exec_potentially_empty = true;
3297
3298    bool divergent = ctx->cf_info.parent_if.is_divergent ||
3299                     ctx->cf_info.parent_loop.has_divergent_continue;
3300
3301    if (ctx->block->loop_nest_depth &&
3302        ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
3303       /* we handle discards the same way as jump instructions */
3304       append_logical_end(ctx->block);
3305
3306       /* in loops, discard behaves like break */
3307       Block *linear_target = ctx->cf_info.parent_loop.exit;
3308       ctx->block->kind |= block_kind_discard;
3309
3310       if (!divergent) {
3311          /* uniform discard - loop ends here */
3312          assert(nir_instr_is_last(&instr->instr));
3313          ctx->block->kind |= block_kind_uniform;
3314          ctx->cf_info.has_branch = true;
3315          bld.branch(aco_opcode::p_branch);
3316          add_linear_edge(ctx->block->index, linear_target);
3317          return;
3318       }
3319
3320       /* we add a break right behind the discard() instructions */
3321       ctx->block->kind |= block_kind_break;
3322       unsigned idx = ctx->block->index;
3323
3324       /* remove critical edges from linear CFG */
3325       bld.branch(aco_opcode::p_branch);
3326       Block* break_block = ctx->program->create_and_insert_block();
3327       break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3328       break_block->kind |= block_kind_uniform;
3329       add_linear_edge(idx, break_block);
3330       add_linear_edge(break_block->index, linear_target);
3331       bld.reset(break_block);
3332       bld.branch(aco_opcode::p_branch);
3333
3334       Block* continue_block = ctx->program->create_and_insert_block();
3335       continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3336       add_linear_edge(idx, continue_block);
3337       append_logical_start(continue_block);
3338       ctx->block = continue_block;
3339
3340       return;
3341    }
3342
3343    /* it can currently happen that NIR doesn't remove the unreachable code */
3344    if (!nir_instr_is_last(&instr->instr)) {
3345       ctx->program->needs_exact = true;
3346       /* save exec somewhere temporarily so that it doesn't get
3347        * overwritten before the discard from outer exec masks */
3348       Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, s2));
3349       bld.pseudo(aco_opcode::p_discard_if, cond);
3350       ctx->block->kind |= block_kind_uses_discard_if;
3351       return;
3352    }
3353
3354    /* This condition is incorrect for uniformly branched discards in a loop
3355     * predicated by a divergent condition, but the above code catches that case
3356     * and the discard would end up turning into a discard_if.
3357     * For example:
3358     * if (divergent) {
3359     *    while (...) {
3360     *       if (uniform) {
3361     *          discard;
3362     *       }
3363     *    }
3364     * }
3365     */
3366    if (!ctx->cf_info.parent_if.is_divergent) {
3367       /* program just ends here */
3368       ctx->block->kind |= block_kind_uniform;
3369       bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
3370               0 /* enabled mask */, 9 /* dest */,
3371               false /* compressed */, true/* done */, true /* valid mask */);
3372       bld.sopp(aco_opcode::s_endpgm);
3373       // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
3374    } else {
3375       ctx->block->kind |= block_kind_discard;
3376       /* branch and linear edge is added by visit_if() */
3377    }
3378 }
3379
/* Selects which part of a descriptor get_sampler_desc() loads. The sizes and
 * offsets below are the ones get_sampler_desc() uses for each value. */
enum aco_descriptor_type {
   ACO_DESC_IMAGE = 0,   /* 8 dwords at the binding's offset */
   ACO_DESC_FMASK = 1,   /* 8 dwords, 32 bytes after the binding's offset */
   ACO_DESC_SAMPLER = 2, /* 4 dwords */
   ACO_DESC_BUFFER = 3,  /* 4 dwords */
   ACO_DESC_PLANE_0 = 4, /* 8 dwords at the binding's offset */
   ACO_DESC_PLANE_1 = 5, /* 8 dwords, 32 bytes after the binding's offset */
   ACO_DESC_PLANE_2 = 6, /* 4 dwords, 64 bytes after the binding's offset */
};
3389
3390 static bool
3391 should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
3392    if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
3393       return false;
3394    ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
3395    return dim == ac_image_cube ||
3396           dim == ac_image_1darray ||
3397           dim == ac_image_2darray ||
3398           dim == ac_image_2darraymsaa;
3399 }
3400
3401 Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
3402                       enum aco_descriptor_type desc_type,
3403                       const nir_tex_instr *tex_instr, bool image, bool write)
3404 {
3405 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
3406    std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
3407    if (it != ctx->tex_desc.end())
3408       return it->second;
3409 */
3410    Temp index = Temp();
3411    bool index_set = false;
3412    unsigned constant_index = 0;
3413    unsigned descriptor_set;
3414    unsigned base_index;
3415    Builder bld(ctx->program, ctx->block);
3416
3417    if (!deref_instr) {
3418       assert(tex_instr && !image);
3419       descriptor_set = 0;
3420       base_index = tex_instr->sampler_index;
3421    } else {
3422       while(deref_instr->deref_type != nir_deref_type_var) {
3423          unsigned array_size = glsl_get_aoa_size(deref_instr->type);
3424          if (!array_size)
3425             array_size = 1;
3426
3427          assert(deref_instr->deref_type == nir_deref_type_array);
3428          nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
3429          if (const_value) {
3430             constant_index += array_size * const_value->u32;
3431          } else {
3432             Temp indirect = bld.as_uniform(get_ssa_temp(ctx, deref_instr->arr.index.ssa));
3433
3434             if (array_size != 1)
3435                indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
3436
3437             if (!index_set) {
3438                index = indirect;
3439                index_set = true;
3440             } else {
3441                index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
3442             }
3443          }
3444
3445          deref_instr = nir_src_as_deref(deref_instr->parent);
3446       }
3447       descriptor_set = deref_instr->var->data.descriptor_set;
3448       base_index = deref_instr->var->data.binding;
3449    }
3450
3451    Temp list = load_desc_ptr(ctx, descriptor_set);
3452    list = convert_pointer_to_64_bit(ctx, list);
3453
3454    struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
3455    struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
3456    unsigned offset = binding->offset;
3457    unsigned stride = binding->size;
3458    aco_opcode opcode;
3459    RegClass type;
3460
3461    assert(base_index < layout->binding_count);
3462
3463    switch (desc_type) {
3464    case ACO_DESC_IMAGE:
3465       type = s8;
3466       opcode = aco_opcode::s_load_dwordx8;
3467       break;
3468    case ACO_DESC_FMASK:
3469       type = s8;
3470       opcode = aco_opcode::s_load_dwordx8;
3471       offset += 32;
3472       break;
3473    case ACO_DESC_SAMPLER:
3474       type = s4;
3475       opcode = aco_opcode::s_load_dwordx4;
3476       if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
3477          offset += radv_combined_image_descriptor_sampler_offset(binding);
3478       break;
3479    case ACO_DESC_BUFFER:
3480       type = s4;
3481       opcode = aco_opcode::s_load_dwordx4;
3482       break;
3483    case ACO_DESC_PLANE_0:
3484    case ACO_DESC_PLANE_1:
3485       type = s8;
3486       opcode = aco_opcode::s_load_dwordx8;
3487       offset += 32 * (desc_type - ACO_DESC_PLANE_0);
3488       break;
3489    case ACO_DESC_PLANE_2:
3490       type = s4;
3491       opcode = aco_opcode::s_load_dwordx4;
3492       offset += 64;
3493       break;
3494    default:
3495       unreachable("invalid desc_type\n");
3496    }
3497
3498    offset += constant_index * stride;
3499
3500    if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
3501       (!index_set || binding->immutable_samplers_equal)) {
3502       if (binding->immutable_samplers_equal)
3503          constant_index = 0;
3504
3505       const uint32_t *samplers = radv_immutable_samplers(layout, binding);
3506       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3507                         Operand(samplers[constant_index * 4 + 0]),
3508                         Operand(samplers[constant_index * 4 + 1]),
3509                         Operand(samplers[constant_index * 4 + 2]),
3510                         Operand(samplers[constant_index * 4 + 3]));
3511    }
3512
3513    Operand off;
3514    if (!index_set) {
3515       off = Operand(offset);
3516    } else {
3517       off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
3518                                    bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
3519    }
3520
3521    Temp res = bld.smem(opcode, bld.def(type), list, off);
3522
3523    if (desc_type == ACO_DESC_PLANE_2) {
3524       Temp components[8];
3525       for (unsigned i = 0; i < 8; i++)
3526          components[i] = bld.tmp(s1);
3527       bld.pseudo(aco_opcode::p_split_vector,
3528                  Definition(components[0]),
3529                  Definition(components[1]),
3530                  Definition(components[2]),
3531                  Definition(components[3]),
3532                  res);
3533
3534       Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
3535       bld.pseudo(aco_opcode::p_split_vector,
3536                  bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
3537                  Definition(components[4]),
3538                  Definition(components[5]),
3539                  Definition(components[6]),
3540                  Definition(components[7]),
3541                  desc2);
3542
3543       res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
3544                        components[0], components[1], components[2], components[3],
3545                        components[4], components[5], components[6], components[7]);
3546    }
3547
3548    return res;
3549 }
3550
3551 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
3552 {
3553    switch (dim) {
3554    case GLSL_SAMPLER_DIM_BUF:
3555       return 1;
3556    case GLSL_SAMPLER_DIM_1D:
3557       return array ? 2 : 1;
3558    case GLSL_SAMPLER_DIM_2D:
3559       return array ? 3 : 2;
3560    case GLSL_SAMPLER_DIM_MS:
3561       return array ? 4 : 3;
3562    case GLSL_SAMPLER_DIM_3D:
3563    case GLSL_SAMPLER_DIM_CUBE:
3564       return 3;
3565    case GLSL_SAMPLER_DIM_RECT:
3566    case GLSL_SAMPLER_DIM_SUBPASS:
3567       return 2;
3568    case GLSL_SAMPLER_DIM_SUBPASS_MS:
3569       return 3;
3570    default:
3571       break;
3572    }
3573    return 0;
3574 }
3575
3576
3577 /* Adjust the sample index according to FMASK.
3578  *
3579  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3580  * which is the identity mapping. Each nibble says which physical sample
3581  * should be fetched to get that sample.
3582  *
3583  * For example, 0x11111100 means there are only 2 samples stored and
3584  * the second sample covers 3/4 of the pixel. When reading samples 0
3585  * and 1, return physical sample 0 (determined by the first two 0s
3586  * in FMASK), otherwise return physical sample 1.
3587  *
3588  * The sample index should be adjusted as follows:
3589  *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
3590  */
3591 static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coords, Operand sample_index, Temp fmask_desc_ptr)
3592 {
3593    Builder bld(ctx->program, ctx->block);
3594    Temp fmask = bld.tmp(v1);
3595    unsigned dim = ctx->options->chip_class >= GFX10
3596                   ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
3597                   : 0;
3598
3599    aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
3600    load->operands[0] = Operand(coords);
3601    load->operands[1] = Operand(fmask_desc_ptr);
3602    load->definitions[0] = Definition(fmask);
3603    load->glc = false;
3604    load->dlc = false;
3605    load->dmask = 0x1;
3606    load->unrm = true;
3607    load->da = da;
3608    load->dim = dim;
3609    load->can_reorder = true; /* fmask images shouldn't be modified */
3610    ctx->block->instructions.emplace_back(std::move(load));
3611
3612    Operand sample_index4;
3613    if (sample_index.isConstant() && sample_index.constantValue() < 16) {
3614       sample_index4 = Operand(sample_index.constantValue() << 2);
3615    } else if (sample_index.regClass() == s1) {
3616       sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
3617    } else {
3618       assert(sample_index.regClass() == v1);
3619       sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
3620    }
3621
3622    Temp final_sample;
3623    if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
3624       final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
3625    else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
3626       final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
3627    else
3628       final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
3629
3630    /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3631     * resource descriptor is 0 (invalid),
3632     */
3633    Temp compare = bld.tmp(s2);
3634    bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
3635                 Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
3636
3637    Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
3638
3639    /* Replace the MSAA sample index. */
3640    return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
3641 }
3642
3643 static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
3644 {
3645
3646    Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
3647    enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3648    bool is_array = glsl_sampler_type_is_array(type);
3649    ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3650    assert(!add_frag_pos && "Input attachments should be lowered.");
3651    bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3652    bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
3653    int count = image_type_to_components_count(dim, is_array);
3654    std::vector<Operand> coords(count);
3655
3656    if (is_ms) {
3657       Operand sample_index;
3658       nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
3659       if (sample_cv)
3660          sample_index = Operand(sample_cv->u32);
3661       else
3662          sample_index = Operand(emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1));
3663
3664       if (instr->intrinsic == nir_intrinsic_image_deref_load) {
3665          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, is_array ? 3 : 2, 1)};
3666          for (unsigned i = 0; i < vec->operands.size(); i++)
3667             vec->operands[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3668          Temp fmask_load_address = {ctx->program->allocateId(), is_array ? v3 : v2};
3669          vec->definitions[0] = Definition(fmask_load_address);
3670          ctx->block->instructions.emplace_back(std::move(vec));
3671
3672          Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
3673          sample_index = Operand(adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr));
3674       }
3675       count--;
3676       coords[count] = sample_index;
3677    }
3678
3679    if (count == 1 && !gfx9_1d)
3680       return emit_extract_vector(ctx, src0, 0, v1);
3681
3682    if (gfx9_1d) {
3683       coords[0] = Operand(emit_extract_vector(ctx, src0, 0, v1));
3684       coords.resize(coords.size() + 1);
3685       coords[1] = Operand((uint32_t) 0);
3686       if (is_array)
3687          coords[2] = Operand(emit_extract_vector(ctx, src0, 1, v1));
3688    } else {
3689       for (int i = 0; i < count; i++)
3690          coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3691    }
3692
3693    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
3694    for (unsigned i = 0; i < coords.size(); i++)
3695       vec->operands[i] = coords[i];
3696    Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
3697    vec->definitions[0] = Definition(res);
3698    ctx->block->instructions.emplace_back(std::move(vec));
3699    return res;
3700 }
3701
3702
3703 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
3704 {
3705    Builder bld(ctx->program, ctx->block);
3706    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
3707    const struct glsl_type *type = glsl_without_array(var->type);
3708    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3709    bool is_array = glsl_sampler_type_is_array(type);
3710    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3711
3712    if (dim == GLSL_SAMPLER_DIM_BUF) {
3713       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
3714       unsigned num_channels = util_last_bit(mask);
3715       Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
3716       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
3717
3718       aco_opcode opcode;
3719       switch (num_channels) {
3720       case 1:
3721          opcode = aco_opcode::buffer_load_format_x;
3722          break;
3723       case 2:
3724          opcode = aco_opcode::buffer_load_format_xy;
3725          break;
3726       case 3:
3727          opcode = aco_opcode::buffer_load_format_xyz;
3728          break;
3729       case 4:
3730          opcode = aco_opcode::buffer_load_format_xyzw;
3731          break;
3732       default:
3733          unreachable(">4 channel buffer image load");
3734       }
3735       aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
3736       load->operands[0] = Operand(vindex);
3737       load->operands[1] = Operand(rsrc);
3738       load->operands[2] = Operand((uint32_t) 0);
3739       Temp tmp;
3740       if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
3741          tmp = dst;
3742       else
3743          tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
3744       load->definitions[0] = Definition(tmp);
3745       load->idxen = true;
3746       load->barrier = barrier_image;
3747       ctx->block->instructions.emplace_back(std::move(load));
3748
3749       expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
3750       return;
3751    }
3752
3753    Temp coords = get_image_coords(ctx, instr, type);
3754    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
3755
3756    unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
3757    unsigned num_components = util_bitcount(dmask);
3758    Temp tmp;
3759    if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
3760       tmp = dst;
3761    else
3762       tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
3763
3764    aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
3765    load->operands[0] = Operand(coords);
3766    load->operands[1] = Operand(resource);
3767    load->definitions[0] = Definition(tmp);
3768    load->glc = var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
3769    load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
3770    load->dmask = dmask;
3771    load->unrm = true;
3772    load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
3773    load->barrier = barrier_image;
3774    ctx->block->instructions.emplace_back(std::move(load));
3775
3776    expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
3777    return;
3778 }
3779
3780 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
3781 {
3782    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
3783    const struct glsl_type *type = glsl_without_array(var->type);
3784    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3785    bool is_array = glsl_sampler_type_is_array(type);
3786    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
3787
3788    bool glc = ctx->options->chip_class == GFX6 || var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
3789
3790    if (dim == GLSL_SAMPLER_DIM_BUF) {
3791       Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
3792       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
3793       aco_opcode opcode;
3794       switch (data.size()) {
3795       case 1:
3796          opcode = aco_opcode::buffer_store_format_x;
3797          break;
3798       case 2:
3799          opcode = aco_opcode::buffer_store_format_xy;
3800          break;
3801       case 3:
3802          opcode = aco_opcode::buffer_store_format_xyz;
3803          break;
3804       case 4:
3805          opcode = aco_opcode::buffer_store_format_xyzw;
3806          break;
3807       default:
3808          unreachable(">4 channel buffer image store");
3809       }
3810       aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
3811       store->operands[0] = Operand(vindex);
3812       store->operands[1] = Operand(rsrc);
3813       store->operands[2] = Operand((uint32_t) 0);
3814       store->operands[3] = Operand(data);
3815       store->idxen = true;
3816       store->glc = glc;
3817       store->dlc = false;
3818       store->disable_wqm = true;
3819       store->barrier = barrier_image;
3820       ctx->program->needs_exact = true;
3821       ctx->block->instructions.emplace_back(std::move(store));
3822       return;
3823    }
3824
3825    assert(data.type() == RegType::vgpr);
3826    Temp coords = get_image_coords(ctx, instr, type);
3827    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
3828
3829    aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(aco_opcode::image_store, Format::MIMG, 4, 0)};
3830    store->operands[0] = Operand(coords);
3831    store->operands[1] = Operand(resource);
3832    store->operands[2] = Operand(s4);
3833    store->operands[3] = Operand(data);
3834    store->glc = glc;
3835    store->dlc = false;
3836    store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
3837    store->dmask = (1 << data.size()) - 1;
3838    store->unrm = true;
3839    store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
3840    store->disable_wqm = true;
3841    store->barrier = barrier_image;
3842    ctx->program->needs_exact = true;
3843    ctx->block->instructions.emplace_back(std::move(store));
3844    return;
3845 }
3846
3847 void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
3848 {
3849    /* return the previous value if dest is ever used */
3850    bool return_previous = false;
3851    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
3852       return_previous = true;
3853       break;
3854    }
3855    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
3856       return_previous = true;
3857       break;
3858    }
3859
3860    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
3861    const struct glsl_type *type = glsl_without_array(var->type);
3862    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3863    bool is_array = glsl_sampler_type_is_array(type);
3864    Builder bld(ctx->program, ctx->block);
3865
3866    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
3867    assert(data.size() == 1 && "64bit ssbo atomics not yet implemented.");
3868
3869    if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
3870       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
3871
3872    aco_opcode buf_op, image_op;
3873    switch (instr->intrinsic) {
3874       case nir_intrinsic_image_deref_atomic_add:
3875          buf_op = aco_opcode::buffer_atomic_add;
3876          image_op = aco_opcode::image_atomic_add;
3877          break;
3878       case nir_intrinsic_image_deref_atomic_umin:
3879          buf_op = aco_opcode::buffer_atomic_umin;
3880          image_op = aco_opcode::image_atomic_umin;
3881          break;
3882       case nir_intrinsic_image_deref_atomic_imin:
3883          buf_op = aco_opcode::buffer_atomic_smin;
3884          image_op = aco_opcode::image_atomic_smin;
3885          break;
3886       case nir_intrinsic_image_deref_atomic_umax:
3887          buf_op = aco_opcode::buffer_atomic_umax;
3888          image_op = aco_opcode::image_atomic_umax;
3889          break;
3890       case nir_intrinsic_image_deref_atomic_imax:
3891          buf_op = aco_opcode::buffer_atomic_smax;
3892          image_op = aco_opcode::image_atomic_smax;
3893          break;
3894       case nir_intrinsic_image_deref_atomic_and:
3895          buf_op = aco_opcode::buffer_atomic_and;
3896          image_op = aco_opcode::image_atomic_and;
3897          break;
3898       case nir_intrinsic_image_deref_atomic_or:
3899          buf_op = aco_opcode::buffer_atomic_or;
3900          image_op = aco_opcode::image_atomic_or;
3901          break;
3902       case nir_intrinsic_image_deref_atomic_xor:
3903          buf_op = aco_opcode::buffer_atomic_xor;
3904          image_op = aco_opcode::image_atomic_xor;
3905          break;
3906       case nir_intrinsic_image_deref_atomic_exchange:
3907          buf_op = aco_opcode::buffer_atomic_swap;
3908          image_op = aco_opcode::image_atomic_swap;
3909          break;
3910       case nir_intrinsic_image_deref_atomic_comp_swap:
3911          buf_op = aco_opcode::buffer_atomic_cmpswap;
3912          image_op = aco_opcode::image_atomic_cmpswap;
3913          break;
3914       default:
3915          unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
3916    }
3917
3918    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3919
3920    if (dim == GLSL_SAMPLER_DIM_BUF) {
3921       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
3922       Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
3923       //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
3924       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
3925       mubuf->operands[0] = Operand(vindex);
3926       mubuf->operands[1] = Operand(resource);
3927       mubuf->operands[2] = Operand((uint32_t)0);
3928       mubuf->operands[3] = Operand(data);
3929       if (return_previous)
3930          mubuf->definitions[0] = Definition(dst);
3931       mubuf->offset = 0;
3932       mubuf->idxen = true;
3933       mubuf->glc = return_previous;
3934       mubuf->dlc = false; /* Not needed for atomics */
3935       mubuf->disable_wqm = true;
3936       mubuf->barrier = barrier_image;
3937       ctx->program->needs_exact = true;
3938       ctx->block->instructions.emplace_back(std::move(mubuf));
3939       return;
3940    }
3941
3942    Temp coords = get_image_coords(ctx, instr, type);
3943    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
3944    aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 4, return_previous ? 1 : 0)};
3945    mimg->operands[0] = Operand(coords);
3946    mimg->operands[1] = Operand(resource);
3947    mimg->operands[2] = Operand(s4); /* no sampler */
3948    mimg->operands[3] = Operand(data);
3949    if (return_previous)
3950       mimg->definitions[0] = Definition(dst);
3951    mimg->glc = return_previous;
3952    mimg->dlc = false; /* Not needed for atomics */
3953    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
3954    mimg->dmask = (1 << data.size()) - 1;
3955    mimg->unrm = true;
3956    mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
3957    mimg->disable_wqm = true;
3958    mimg->barrier = barrier_image;
3959    ctx->program->needs_exact = true;
3960    ctx->block->instructions.emplace_back(std::move(mimg));
3961    return;
3962 }
3963
3964 void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
3965 {
3966    if (in_elements && ctx->options->chip_class == GFX8) {
3967       Builder bld(ctx->program, ctx->block);
3968
3969       Temp stride = emit_extract_vector(ctx, desc, 1, s1);
3970       stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
3971       stride = bld.vop1(aco_opcode::v_cvt_f32_ubyte0, bld.def(v1), stride);
3972       stride = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), stride);
3973
3974       Temp size = emit_extract_vector(ctx, desc, 2, s1);
3975       size = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), size);
3976
3977       Temp res = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), size, stride);
3978       res = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), res);
3979       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
3980
3981       // TODO: we can probably calculate this faster on the scalar unit to do: size / stride{1,2,4,8,12,16}
3982       /* idea
3983        * for 1,2,4,8,16, the result is just (stride >> S_FF1_I32_B32)
3984        * in case 12 (or 3?), we have to divide by 3:
3985        * set v_skip in case it's 12 (if we also have to take care of 3, shift first)
3986        * use v_mul_hi_u32 with magic number to divide
3987        * we need some pseudo merge opcode to overwrite the original SALU result with readfirstlane
3988        * disable v_skip
3989        * total: 6 SALU + 2 VALU instructions vs 1 SALU + 6 VALU instructions
3990        */
3991
3992    } else {
3993       emit_extract_vector(ctx, desc, 2, dst);
3994    }
3995 }
3996
3997 void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
3998 {
3999    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4000    const struct glsl_type *type = glsl_without_array(var->type);
4001    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4002    bool is_array = glsl_sampler_type_is_array(type);
4003    Builder bld(ctx->program, ctx->block);
4004
4005    if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
4006       Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
4007       return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
4008    }
4009
4010    /* LOD */
4011    Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
4012
4013    /* Resource */
4014    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
4015
4016    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4017
4018    aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)};
4019    mimg->operands[0] = Operand(lod);
4020    mimg->operands[1] = Operand(resource);
4021    unsigned& dmask = mimg->dmask;
4022    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4023    mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
4024    mimg->da = glsl_sampler_type_is_array(type);
4025    mimg->can_reorder = true;
4026    Definition& def = mimg->definitions[0];
4027    ctx->block->instructions.emplace_back(std::move(mimg));
4028
4029    if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
4030        glsl_sampler_type_is_array(type)) {
4031
4032       assert(instr->dest.ssa.num_components == 3);
4033       Temp tmp = {ctx->program->allocateId(), v3};
4034       def = Definition(tmp);
4035       emit_split_vector(ctx, tmp, 3);
4036
4037       /* divide 3rd value by 6 by multiplying with magic number */
4038       Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
4039       Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
4040
4041       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4042                  emit_extract_vector(ctx, tmp, 0, v1),
4043                  emit_extract_vector(ctx, tmp, 1, v1),
4044                  by_6);
4045
4046    } else if (ctx->options->chip_class == GFX9 &&
4047               glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
4048               glsl_sampler_type_is_array(type)) {
4049       assert(instr->dest.ssa.num_components == 2);
4050       def = Definition(dst);
4051       dmask = 0x5;
4052    } else {
4053       def = Definition(dst);
4054    }
4055
4056    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
4057 }
4058
4059 void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4060 {
4061    Builder bld(ctx->program, ctx->block);
4062    unsigned num_components = instr->num_components;
4063
4064    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4065    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4066    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4067
4068    bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4069    load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc);
4070 }
4071
4072 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4073 {
4074    Builder bld(ctx->program, ctx->block);
4075    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4076    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4077    unsigned writemask = nir_intrinsic_write_mask(instr);
4078
4079    Temp offset;
4080    if (ctx->options->chip_class < GFX8)
4081       offset = as_vgpr(ctx,get_ssa_temp(ctx, instr->src[2].ssa));
4082    else
4083       offset = get_ssa_temp(ctx, instr->src[2].ssa);
4084
4085    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4086    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4087
4088    bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
4089                ctx->options->chip_class >= GFX8;
4090    if (smem)
4091       offset = bld.as_uniform(offset);
4092    bool smem_nonfs = smem && ctx->stage != fragment_fs;
4093
4094    while (writemask) {
4095       int start, count;
4096       u_bit_scan_consecutive_range(&writemask, &start, &count);
4097       if (count == 3 && smem) {
4098          writemask |= 1u << (start + 2);
4099          count = 2;
4100       }
4101       int num_bytes = count * elem_size_bytes;
4102
4103       if (num_bytes > 16) {
4104          assert(elem_size_bytes == 8);
4105          writemask |= (((count - 2) << 1) - 1) << (start + 2);
4106          count = 2;
4107          num_bytes = 16;
4108       }
4109
4110       // TODO: check alignment of sub-dword stores
4111       // TODO: split 3 bytes. there is no store instruction for that
4112
4113       Temp write_data;
4114       if (count != instr->num_components) {
4115          emit_split_vector(ctx, data, instr->num_components);
4116          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4117          for (int i = 0; i < count; i++) {
4118             Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
4119             vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
4120          }
4121          write_data = bld.tmp(smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
4122          vec->definitions[0] = Definition(write_data);
4123          ctx->block->instructions.emplace_back(std::move(vec));
4124       } else if (!smem && data.type() != RegType::vgpr) {
4125          assert(num_bytes % 4 == 0);
4126          write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
4127       } else if (smem_nonfs && data.type() == RegType::vgpr) {
4128          assert(num_bytes % 4 == 0);
4129          write_data = bld.as_uniform(data);
4130       } else {
4131          write_data = data;
4132       }
4133
4134       aco_opcode vmem_op, smem_op;
4135       switch (num_bytes) {
4136          case 4:
4137             vmem_op = aco_opcode::buffer_store_dword;
4138             smem_op = aco_opcode::s_buffer_store_dword;
4139             break;
4140          case 8:
4141             vmem_op = aco_opcode::buffer_store_dwordx2;
4142             smem_op = aco_opcode::s_buffer_store_dwordx2;
4143             break;
4144          case 12:
4145             vmem_op = aco_opcode::buffer_store_dwordx3;
4146             smem_op = aco_opcode::last_opcode;
4147             assert(!smem);
4148             break;
4149          case 16:
4150             vmem_op = aco_opcode::buffer_store_dwordx4;
4151             smem_op = aco_opcode::s_buffer_store_dwordx4;
4152             break;
4153          default:
4154             unreachable("Store SSBO not implemented for this size.");
4155       }
4156       if (ctx->stage == fragment_fs)
4157          smem_op = aco_opcode::p_fs_buffer_store_smem;
4158
4159       if (smem) {
4160          aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
4161          store->operands[0] = Operand(rsrc);
4162          if (start) {
4163             Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
4164                                 offset, Operand(start * elem_size_bytes));
4165             store->operands[1] = Operand(off);
4166          } else {
4167             store->operands[1] = Operand(offset);
4168          }
4169          if (smem_op != aco_opcode::p_fs_buffer_store_smem)
4170             store->operands[1].setFixed(m0);
4171          store->operands[2] = Operand(write_data);
4172          store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4173          store->dlc = false;
4174          store->disable_wqm = true;
4175          store->barrier = barrier_buffer;
4176          ctx->block->instructions.emplace_back(std::move(store));
4177          ctx->program->wb_smem_l1_on_end = true;
4178          if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
4179             ctx->block->kind |= block_kind_needs_lowering;
4180             ctx->program->needs_exact = true;
4181          }
4182       } else {
4183          aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
4184          store->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4185          store->operands[1] = Operand(rsrc);
4186          store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4187          store->operands[3] = Operand(write_data);
4188          store->offset = start * elem_size_bytes;
4189          store->offen = (offset.type() == RegType::vgpr);
4190          store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4191          store->dlc = false;
4192          store->disable_wqm = true;
4193          store->barrier = barrier_buffer;
4194          ctx->program->needs_exact = true;
4195          ctx->block->instructions.emplace_back(std::move(store));
4196       }
4197    }
4198 }
4199
4200 void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4201 {
4202    /* return the previous value if dest is ever used */
4203    bool return_previous = false;
4204    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4205       return_previous = true;
4206       break;
4207    }
4208    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4209       return_previous = true;
4210       break;
4211    }
4212
4213    Builder bld(ctx->program, ctx->block);
4214    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
4215
4216    if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
4217       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
4218                         get_ssa_temp(ctx, instr->src[3].ssa), data);
4219
4220    Temp offset;
4221    if (ctx->options->chip_class < GFX8)
4222       offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4223    else
4224       offset = get_ssa_temp(ctx, instr->src[1].ssa);
4225
4226    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4227    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4228
4229    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4230
4231    aco_opcode op32, op64;
4232    switch (instr->intrinsic) {
4233       case nir_intrinsic_ssbo_atomic_add:
4234          op32 = aco_opcode::buffer_atomic_add;
4235          op64 = aco_opcode::buffer_atomic_add_x2;
4236          break;
4237       case nir_intrinsic_ssbo_atomic_imin:
4238          op32 = aco_opcode::buffer_atomic_smin;
4239          op64 = aco_opcode::buffer_atomic_smin_x2;
4240          break;
4241       case nir_intrinsic_ssbo_atomic_umin:
4242          op32 = aco_opcode::buffer_atomic_umin;
4243          op64 = aco_opcode::buffer_atomic_umin_x2;
4244          break;
4245       case nir_intrinsic_ssbo_atomic_imax:
4246          op32 = aco_opcode::buffer_atomic_smax;
4247          op64 = aco_opcode::buffer_atomic_smax_x2;
4248          break;
4249       case nir_intrinsic_ssbo_atomic_umax:
4250          op32 = aco_opcode::buffer_atomic_umax;
4251          op64 = aco_opcode::buffer_atomic_umax_x2;
4252          break;
4253       case nir_intrinsic_ssbo_atomic_and:
4254          op32 = aco_opcode::buffer_atomic_and;
4255          op64 = aco_opcode::buffer_atomic_and_x2;
4256          break;
4257       case nir_intrinsic_ssbo_atomic_or:
4258          op32 = aco_opcode::buffer_atomic_or;
4259          op64 = aco_opcode::buffer_atomic_or_x2;
4260          break;
4261       case nir_intrinsic_ssbo_atomic_xor:
4262          op32 = aco_opcode::buffer_atomic_xor;
4263          op64 = aco_opcode::buffer_atomic_xor_x2;
4264          break;
4265       case nir_intrinsic_ssbo_atomic_exchange:
4266          op32 = aco_opcode::buffer_atomic_swap;
4267          op64 = aco_opcode::buffer_atomic_swap_x2;
4268          break;
4269       case nir_intrinsic_ssbo_atomic_comp_swap:
4270          op32 = aco_opcode::buffer_atomic_cmpswap;
4271          op64 = aco_opcode::buffer_atomic_cmpswap_x2;
4272          break;
4273       default:
4274          unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
4275    }
4276    aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
4277    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
4278    mubuf->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4279    mubuf->operands[1] = Operand(rsrc);
4280    mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4281    mubuf->operands[3] = Operand(data);
4282    if (return_previous)
4283       mubuf->definitions[0] = Definition(dst);
4284    mubuf->offset = 0;
4285    mubuf->offen = (offset.type() == RegType::vgpr);
4286    mubuf->glc = return_previous;
4287    mubuf->dlc = false; /* Not needed for atomics */
4288    mubuf->disable_wqm = true;
4289    mubuf->barrier = barrier_buffer;
4290    ctx->program->needs_exact = true;
4291    ctx->block->instructions.emplace_back(std::move(mubuf));
4292 }
4293
4294 void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
4295
4296    Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4297    Builder bld(ctx->program, ctx->block);
4298    Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
4299    get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
4300 }
4301
4302 void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
4303 {
4304    Builder bld(ctx->program, ctx->block);
4305    unsigned num_components = instr->num_components;
4306    unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8;
4307
4308    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4309    Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
4310
4311    bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4312    bool dlc = glc && ctx->options->chip_class >= GFX10;
4313    aco_opcode op;
4314    if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
4315       bool global = ctx->options->chip_class >= GFX9;
4316       aco_opcode op;
4317       switch (num_bytes) {
4318       case 4:
4319          op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
4320          break;
4321       case 8:
4322          op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
4323          break;
4324       case 12:
4325          op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4326          break;
4327       case 16:
4328          op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
4329          break;
4330       default:
4331          unreachable("load_global not implemented for this size.");
4332       }
4333       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4334       flat->operands[0] = Operand(addr);
4335       flat->operands[1] = Operand(s1);
4336       flat->glc = glc;
4337       flat->dlc = dlc;
4338
4339       if (dst.type() == RegType::sgpr) {
4340          Temp vec = bld.tmp(RegType::vgpr, dst.size());
4341          flat->definitions[0] = Definition(vec);
4342          ctx->block->instructions.emplace_back(std::move(flat));
4343          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
4344       } else {
4345          flat->definitions[0] = Definition(dst);
4346          ctx->block->instructions.emplace_back(std::move(flat));
4347       }
4348       emit_split_vector(ctx, dst, num_components);
4349    } else {
4350       switch (num_bytes) {
4351          case 4:
4352             op = aco_opcode::s_load_dword;
4353             break;
4354          case 8:
4355             op = aco_opcode::s_load_dwordx2;
4356             break;
4357          case 12:
4358          case 16:
4359             op = aco_opcode::s_load_dwordx4;
4360             break;
4361          default:
4362             unreachable("load_global not implemented for this size.");
4363       }
4364       aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4365       load->operands[0] = Operand(addr);
4366       load->operands[1] = Operand(0u);
4367       load->definitions[0] = Definition(dst);
4368       load->glc = glc;
4369       load->dlc = dlc;
4370       load->barrier = barrier_buffer;
4371       assert(ctx->options->chip_class >= GFX8 || !glc);
4372
4373       if (dst.size() == 3) {
4374          /* trim vector */
4375          Temp vec = bld.tmp(s4);
4376          load->definitions[0] = Definition(vec);
4377          ctx->block->instructions.emplace_back(std::move(load));
4378          emit_split_vector(ctx, vec, 4);
4379
4380          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4381                     emit_extract_vector(ctx, vec, 0, s1),
4382                     emit_extract_vector(ctx, vec, 1, s1),
4383                     emit_extract_vector(ctx, vec, 2, s1));
4384       } else {
4385          ctx->block->instructions.emplace_back(std::move(load));
4386       }
4387    }
4388 }
4389
4390 void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
4391 {
4392    Builder bld(ctx->program, ctx->block);
4393    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4394
4395    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4396    Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4397
4398    unsigned writemask = nir_intrinsic_write_mask(instr);
4399    while (writemask) {
4400       int start, count;
4401       u_bit_scan_consecutive_range(&writemask, &start, &count);
4402       unsigned num_bytes = count * elem_size_bytes;
4403
4404       Temp write_data = data;
4405       if (count != instr->num_components) {
4406          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4407          for (int i = 0; i < count; i++)
4408             vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
4409          write_data = bld.tmp(RegType::vgpr, count);
4410          vec->definitions[0] = Definition(write_data);
4411          ctx->block->instructions.emplace_back(std::move(vec));
4412       }
4413
4414       unsigned offset = start * elem_size_bytes;
4415       if (offset > 0 && ctx->options->chip_class < GFX9) {
4416          Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
4417          Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
4418          Temp carry = bld.tmp(s2);
4419          bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
4420
4421          bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
4422                   Operand(offset), addr0);
4423          bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(s2),
4424                   Operand(0u), addr1,
4425                   carry).def(1).setHint(vcc);
4426
4427          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
4428
4429          offset = 0;
4430       }
4431
4432       bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4433       bool global = ctx->options->chip_class >= GFX9;
4434       aco_opcode op;
4435       switch (num_bytes) {
4436       case 4:
4437          op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
4438          break;
4439       case 8:
4440          op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
4441          break;
4442       case 12:
4443          op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
4444          break;
4445       case 16:
4446          op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
4447          break;
4448       default:
4449          unreachable("store_global not implemented for this size.");
4450       }
4451       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
4452       flat->operands[0] = Operand(addr);
4453       flat->operands[1] = Operand(s1);
4454       flat->operands[2] = Operand(data);
4455       flat->glc = glc;
4456       flat->dlc = false;
4457       flat->offset = offset;
4458       ctx->block->instructions.emplace_back(std::move(flat));
4459    }
4460 }
4461
4462 void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
4463    Builder bld(ctx->program, ctx->block);
4464    switch(instr->intrinsic) {
4465       case nir_intrinsic_group_memory_barrier:
4466       case nir_intrinsic_memory_barrier:
4467          bld.barrier(aco_opcode::p_memory_barrier_all);
4468          break;
4469       case nir_intrinsic_memory_barrier_atomic_counter:
4470          bld.barrier(aco_opcode::p_memory_barrier_atomic);
4471          break;
4472       case nir_intrinsic_memory_barrier_buffer:
4473          bld.barrier(aco_opcode::p_memory_barrier_buffer);
4474          break;
4475       case nir_intrinsic_memory_barrier_image:
4476          bld.barrier(aco_opcode::p_memory_barrier_image);
4477          break;
4478       case nir_intrinsic_memory_barrier_shared:
4479          bld.barrier(aco_opcode::p_memory_barrier_shared);
4480          break;
4481       default:
4482          unreachable("Unimplemented memory barrier intrinsic");
4483          break;
4484    }
4485 }
4486
4487 Operand load_lds_size_m0(isel_context *ctx)
4488 {
4489    /* TODO: m0 does not need to be initialized on GFX9+ */
4490    Builder bld(ctx->program, ctx->block);
4491    return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
4492 }
4493
4494
4495 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4496 {
4497    // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
4498    Operand m = load_lds_size_m0(ctx);
4499    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4500    assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
4501    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4502    Builder bld(ctx->program, ctx->block);
4503
4504    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
4505    unsigned bytes_read = 0;
4506    unsigned result_size = 0;
4507    unsigned total_bytes = instr->num_components * elem_size_bytes;
4508    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : instr->dest.ssa.bit_size / 8;
4509    std::array<Temp, 4> result;
4510
4511    while (bytes_read < total_bytes) {
4512       unsigned todo = total_bytes - bytes_read;
4513       bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0;
4514       bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0;
4515
4516       aco_opcode op = aco_opcode::last_opcode;
4517       if (todo >= 16 && aligned16) {
4518          op = aco_opcode::ds_read_b128;
4519          todo = 16;
4520       } else if (todo >= 12 && aligned16) {
4521          op = aco_opcode::ds_read_b96;
4522          todo = 12;
4523       } else if (todo >= 8) {
4524          op = aligned8 ? aco_opcode::ds_read_b64 : aco_opcode::ds_read2_b32;
4525          todo = 8;
4526       } else if (todo >= 4) {
4527          op = aco_opcode::ds_read_b32;
4528          todo = 4;
4529       } else {
4530          assert(false);
4531       }
4532       assert(todo % elem_size_bytes == 0);
4533       unsigned num_elements = todo / elem_size_bytes;
4534       unsigned offset = nir_intrinsic_base(instr) + bytes_read;
4535       unsigned max_offset = op == aco_opcode::ds_read2_b32 ? 1019 : 65535;
4536
4537       Temp address_offset = address;
4538       if (offset > max_offset) {
4539          address_offset = bld.vadd32(bld.def(v1), Operand((uint32_t)nir_intrinsic_base(instr)), address_offset);
4540          offset = bytes_read;
4541       }
4542       assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */
4543
4544       Temp res;
4545       if (instr->num_components == 1 && dst.type() == RegType::vgpr)
4546          res = dst;
4547       else
4548          res = bld.tmp(RegClass(RegType::vgpr, todo / 4));
4549
4550       if (op == aco_opcode::ds_read2_b32)
4551          res = bld.ds(op, Definition(res), address_offset, m, offset >> 2, (offset >> 2) + 1);
4552       else
4553          res = bld.ds(op, Definition(res), address_offset, m, offset);
4554
4555       if (instr->num_components == 1) {
4556          assert(todo == total_bytes);
4557          if (dst.type() == RegType::sgpr)
4558             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
4559          return;
4560       }
4561
4562       if (dst.type() == RegType::sgpr)
4563          res = bld.as_uniform(res);
4564
4565       if (num_elements == 1) {
4566          result[result_size++] = res;
4567       } else {
4568          assert(res != dst && res.size() % num_elements == 0);
4569          aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)};
4570          split->operands[0] = Operand(res);
4571          for (unsigned i = 0; i < num_elements; i++)
4572             split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4));
4573          ctx->block->instructions.emplace_back(std::move(split));
4574       }
4575
4576       bytes_read += todo;
4577    }
4578
4579    assert(result_size == instr->num_components && result_size > 1);
4580    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)};
4581    for (unsigned i = 0; i < result_size; i++)
4582       vec->operands[i] = Operand(result[i]);
4583    vec->definitions[0] = Definition(dst);
4584    ctx->block->instructions.emplace_back(std::move(vec));
4585    ctx->allocated_vec.emplace(dst.id(), result);
4586 }
4587
4588 void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned offset0, unsigned offset1, unsigned align)
4589 {
4590    Builder bld(ctx->program, ctx->block);
4591    unsigned bytes_written = 0;
4592    while (bytes_written < data.size() * 4) {
4593       unsigned todo = data.size() * 4 - bytes_written;
4594       bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
4595       bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
4596
4597       aco_opcode op = aco_opcode::last_opcode;
4598       unsigned size = 0;
4599       if (todo >= 16 && aligned16) {
4600          op = aco_opcode::ds_write_b128;
4601          size = 4;
4602       } else if (todo >= 12 && aligned16) {
4603          op = aco_opcode::ds_write_b96;
4604          size = 3;
4605       } else if (todo >= 8) {
4606          op = aligned8 ? aco_opcode::ds_write_b64 : aco_opcode::ds_write2_b32;
4607          size = 2;
4608       } else if (todo >= 4) {
4609          op = aco_opcode::ds_write_b32;
4610          size = 1;
4611       } else {
4612          assert(false);
4613       }
4614
4615       bool write2 = op == aco_opcode::ds_write2_b32;
4616       unsigned offset = offset0 + offset1 + bytes_written;
4617       unsigned max_offset = write2 ? 1020 : 65535;
4618       Temp address_offset = address;
4619       if (offset > max_offset) {
4620          address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
4621          offset = offset1 + bytes_written;
4622       }
4623       assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
4624
4625       if (write2) {
4626          Temp val0 = emit_extract_vector(ctx, data, bytes_written >> 2, v1);
4627          Temp val1 = emit_extract_vector(ctx, data, (bytes_written >> 2) + 1, v1);
4628          bld.ds(op, address_offset, val0, val1, m, offset >> 2, (offset >> 2) + 1);
4629       } else {
4630          Temp val = emit_extract_vector(ctx, data, bytes_written >> 2, RegClass(RegType::vgpr, size));
4631          bld.ds(op, address_offset, val, m, offset);
4632       }
4633
4634       bytes_written += size * 4;
4635    }
4636 }
4637
4638 void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4639 {
4640    unsigned offset = nir_intrinsic_base(instr);
4641    unsigned writemask = nir_intrinsic_write_mask(instr);
4642    Operand m = load_lds_size_m0(ctx);
4643    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4644    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4645    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4646    assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
4647
4648    /* we need at most two stores for 32bit variables */
4649    int start[2], count[2];
4650    u_bit_scan_consecutive_range(&writemask, &start[0], &count[0]);
4651    u_bit_scan_consecutive_range(&writemask, &start[1], &count[1]);
4652    assert(writemask == 0);
4653
4654    /* one combined store is sufficient */
4655    if (count[0] == count[1]) {
4656       Builder bld(ctx->program, ctx->block);
4657
4658       Temp address_offset = address;
4659       if ((offset >> 2) + start[1] > 255) {
4660          address_offset = bld.vadd32(bld.def(v1), Operand(offset), address_offset);
4661          offset = 0;
4662       }
4663
4664       assert(count[0] == 1);
4665       Temp val0 = emit_extract_vector(ctx, data, start[0], v1);
4666       Temp val1 = emit_extract_vector(ctx, data, start[1], v1);
4667       aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
4668       offset = offset / elem_size_bytes;
4669       bld.ds(op, address_offset, val0, val1, m,
4670              offset + start[0], offset + start[1]);
4671       return;
4672    }
4673
4674    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
4675    for (unsigned i = 0; i < 2; i++) {
4676       if (count[i] == 0)
4677          continue;
4678
4679       Temp write_data = emit_extract_vector(ctx, data, start[i], RegClass(RegType::vgpr, count[i] * elem_size_bytes / 4));
4680       ds_write_helper(ctx, m, address, write_data, offset, start[i] * elem_size_bytes, align);
4681    }
4682    return;
4683 }
4684
4685 void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4686 {
4687    unsigned offset = nir_intrinsic_base(instr);
4688    Operand m = load_lds_size_m0(ctx);
4689    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4690    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4691
4692    unsigned num_operands = 3;
4693    aco_opcode op32, op64, op32_rtn, op64_rtn;
4694    switch(instr->intrinsic) {
4695       case nir_intrinsic_shared_atomic_add:
4696          op32 = aco_opcode::ds_add_u32;
4697          op64 = aco_opcode::ds_add_u64;
4698          op32_rtn = aco_opcode::ds_add_rtn_u32;
4699          op64_rtn = aco_opcode::ds_add_rtn_u64;
4700          break;
4701       case nir_intrinsic_shared_atomic_imin:
4702          op32 = aco_opcode::ds_min_i32;
4703          op64 = aco_opcode::ds_min_i64;
4704          op32_rtn = aco_opcode::ds_min_rtn_i32;
4705          op64_rtn = aco_opcode::ds_min_rtn_i64;
4706          break;
4707       case nir_intrinsic_shared_atomic_umin:
4708          op32 = aco_opcode::ds_min_u32;
4709          op64 = aco_opcode::ds_min_u64;
4710          op32_rtn = aco_opcode::ds_min_rtn_u32;
4711          op64_rtn = aco_opcode::ds_min_rtn_u64;
4712          break;
4713       case nir_intrinsic_shared_atomic_imax:
4714          op32 = aco_opcode::ds_max_i32;
4715          op64 = aco_opcode::ds_max_i64;
4716          op32_rtn = aco_opcode::ds_max_rtn_i32;
4717          op64_rtn = aco_opcode::ds_max_rtn_i64;
4718          break;
4719       case nir_intrinsic_shared_atomic_umax:
4720          op32 = aco_opcode::ds_max_u32;
4721          op64 = aco_opcode::ds_max_u64;
4722          op32_rtn = aco_opcode::ds_max_rtn_u32;
4723          op64_rtn = aco_opcode::ds_max_rtn_u64;
4724          break;
4725       case nir_intrinsic_shared_atomic_and:
4726          op32 = aco_opcode::ds_and_b32;
4727          op64 = aco_opcode::ds_and_b64;
4728          op32_rtn = aco_opcode::ds_and_rtn_b32;
4729          op64_rtn = aco_opcode::ds_and_rtn_b64;
4730          break;
4731       case nir_intrinsic_shared_atomic_or:
4732          op32 = aco_opcode::ds_or_b32;
4733          op64 = aco_opcode::ds_or_b64;
4734          op32_rtn = aco_opcode::ds_or_rtn_b32;
4735          op64_rtn = aco_opcode::ds_or_rtn_b64;
4736          break;
4737       case nir_intrinsic_shared_atomic_xor:
4738          op32 = aco_opcode::ds_xor_b32;
4739          op64 = aco_opcode::ds_xor_b64;
4740          op32_rtn = aco_opcode::ds_xor_rtn_b32;
4741          op64_rtn = aco_opcode::ds_xor_rtn_b64;
4742          break;
4743       case nir_intrinsic_shared_atomic_exchange:
4744          op32 = aco_opcode::ds_write_b32;
4745          op64 = aco_opcode::ds_write_b64;
4746          op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
4747          op64_rtn = aco_opcode::ds_wrxchg2_rtn_b64;
4748          break;
4749       case nir_intrinsic_shared_atomic_comp_swap:
4750          op32 = aco_opcode::ds_cmpst_b32;
4751          op64 = aco_opcode::ds_cmpst_b64;
4752          op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
4753          op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
4754          num_operands = 4;
4755          break;
4756       default:
4757          unreachable("Unhandled shared atomic intrinsic");
4758    }
4759
4760    /* return the previous value if dest is ever used */
4761    bool return_previous = false;
4762    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4763       return_previous = true;
4764       break;
4765    }
4766    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4767       return_previous = true;
4768       break;
4769    }
4770
4771    aco_opcode op;
4772    if (data.size() == 1) {
4773       assert(instr->dest.ssa.bit_size == 32);
4774       op = return_previous ? op32_rtn : op32;
4775    } else {
4776       assert(instr->dest.ssa.bit_size == 64);
4777       op = return_previous ? op64_rtn : op64;
4778    }
4779
4780    if (offset > 65535) {
4781       Builder bld(ctx->program, ctx->block);
4782       address = bld.vadd32(bld.def(v1), Operand(offset), address);
4783       offset = 0;
4784    }
4785
4786    aco_ptr<DS_instruction> ds;
4787    ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
4788    ds->operands[0] = Operand(address);
4789    ds->operands[1] = Operand(data);
4790    if (num_operands == 4)
4791       ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
4792    ds->operands[num_operands - 1] = m;
4793    ds->offset0 = offset;
4794    if (return_previous)
4795       ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
4796    ctx->block->instructions.emplace_back(std::move(ds));
4797 }
4798
4799 Temp get_scratch_resource(isel_context *ctx)
4800 {
4801    Builder bld(ctx->program, ctx->block);
4802    Temp scratch_addr = ctx->private_segment_buffer;
4803    if (ctx->stage != compute_cs)
4804       scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), ctx->private_segment_buffer, Operand(0u));
4805
4806    uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
4807                         S_008F0C_INDEX_STRIDE(ctx->options->wave_size == 64 ? 3 : 2);;
4808
4809    if (ctx->program->chip_class >= GFX10) {
4810       rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
4811                    S_008F0C_OOB_SELECT(3) |
4812                    S_008F0C_RESOURCE_LEVEL(1);
4813    } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
4814       rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4815                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4816    }
4817
4818    /* older generations need element size = 16 bytes. element size removed in GFX9 */
4819    if (ctx->program->chip_class <= GFX8)
4820       rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
4821
4822    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
4823 }
4824
4825 void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
4826    assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64);
4827    Builder bld(ctx->program, ctx->block);
4828    Temp rsrc = get_scratch_resource(ctx);
4829    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4830    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4831
4832    aco_opcode op;
4833    switch (dst.size()) {
4834       case 1:
4835          op = aco_opcode::buffer_load_dword;
4836          break;
4837       case 2:
4838          op = aco_opcode::buffer_load_dwordx2;
4839          break;
4840       case 3:
4841          op = aco_opcode::buffer_load_dwordx3;
4842          break;
4843       case 4:
4844          op = aco_opcode::buffer_load_dwordx4;
4845          break;
4846       case 6:
4847       case 8: {
4848          std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
4849          Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
4850                                 bld.def(v4), offset, rsrc,
4851                                 ctx->scratch_offset, 0, true);
4852          Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
4853                                                   aco_opcode::buffer_load_dwordx4,
4854                                 dst.size() == 6 ? bld.def(v2) : bld.def(v4),
4855                                 offset, rsrc, ctx->scratch_offset, 16, true);
4856          emit_split_vector(ctx, lower, 2);
4857          elems[0] = emit_extract_vector(ctx, lower, 0, v2);
4858          elems[1] = emit_extract_vector(ctx, lower, 1, v2);
4859          if (dst.size() == 8) {
4860             emit_split_vector(ctx, upper, 2);
4861             elems[2] = emit_extract_vector(ctx, upper, 0, v2);
4862             elems[3] = emit_extract_vector(ctx, upper, 1, v2);
4863          } else {
4864             elems[2] = upper;
4865          }
4866
4867          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4868                                                                          Format::PSEUDO, dst.size() / 2, 1)};
4869          for (unsigned i = 0; i < dst.size() / 2; i++)
4870             vec->operands[i] = Operand(elems[i]);
4871          vec->definitions[0] = Definition(dst);
4872          bld.insert(std::move(vec));
4873          ctx->allocated_vec.emplace(dst.id(), elems);
4874          return;
4875       }
4876       default:
4877          unreachable("Wrong dst size for nir_intrinsic_load_scratch");
4878    }
4879
4880    bld.mubuf(op, Definition(dst), offset, rsrc, ctx->scratch_offset, 0, true);
4881    emit_split_vector(ctx, dst, instr->num_components);
4882 }
4883
4884 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
4885    assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
4886    Builder bld(ctx->program, ctx->block);
4887    Temp rsrc = get_scratch_resource(ctx);
4888    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4889    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4890
4891    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4892    unsigned writemask = nir_intrinsic_write_mask(instr);
4893
4894    while (writemask) {
4895       int start, count;
4896       u_bit_scan_consecutive_range(&writemask, &start, &count);
4897       int num_bytes = count * elem_size_bytes;
4898
4899       if (num_bytes > 16) {
4900          assert(elem_size_bytes == 8);
4901          writemask |= (((count - 2) << 1) - 1) << (start + 2);
4902          count = 2;
4903          num_bytes = 16;
4904       }
4905
4906       // TODO: check alignment of sub-dword stores
4907       // TODO: split 3 bytes. there is no store instruction for that
4908
4909       Temp write_data;
4910       if (count != instr->num_components) {
4911          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4912          for (int i = 0; i < count; i++) {
4913             Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
4914             vec->operands[i] = Operand(elem);
4915          }
4916          write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
4917          vec->definitions[0] = Definition(write_data);
4918          ctx->block->instructions.emplace_back(std::move(vec));
4919       } else {
4920          write_data = data;
4921       }
4922
4923       aco_opcode op;
4924       switch (num_bytes) {
4925          case 4:
4926             op = aco_opcode::buffer_store_dword;
4927             break;
4928          case 8:
4929             op = aco_opcode::buffer_store_dwordx2;
4930             break;
4931          case 12:
4932             op = aco_opcode::buffer_store_dwordx3;
4933             break;
4934          case 16:
4935             op = aco_opcode::buffer_store_dwordx4;
4936             break;
4937          default:
4938             unreachable("Invalid data size for nir_intrinsic_store_scratch.");
4939       }
4940
4941       bld.mubuf(op, offset, rsrc, ctx->scratch_offset, write_data, start * elem_size_bytes, true);
4942    }
4943 }
4944
4945 void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
4946    uint8_t log2_ps_iter_samples;
4947    if (ctx->program->info->ps.force_persample) {
4948       log2_ps_iter_samples =
4949          util_logbase2(ctx->options->key.fs.num_samples);
4950    } else {
4951       log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
4952    }
4953
4954    /* The bit pattern matches that used by fixed function fragment
4955     * processing. */
4956    static const unsigned ps_iter_masks[] = {
4957       0xffff, /* not used */
4958       0x5555,
4959       0x1111,
4960       0x0101,
4961       0x0001,
4962    };
4963    assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
4964
4965    Builder bld(ctx->program, ctx->block);
4966
4967    Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ctx->fs_inputs[fs_input::ancillary], Operand(8u), Operand(4u));
4968    Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
4969    Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
4970    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4971    bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, ctx->fs_inputs[fs_input::sample_coverage]);
4972 }
4973
4974 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
4975 {
4976    Builder bld(ctx->program, ctx->block);
4977
4978    if (cluster_size == 1) {
4979       return src;
4980    } if (op == nir_op_iand && cluster_size == 4) {
4981       //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
4982       Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
4983       return bld.sop1(aco_opcode::s_not_b64, bld.def(s2), bld.def(s1, scc),
4984                       bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), tmp));
4985    } else if (op == nir_op_ior && cluster_size == 4) {
4986       //subgroupClusteredOr(val, 4) -> wqm(val & exec)
4987       return bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc),
4988                       bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)));
4989    } else if (op == nir_op_iand && cluster_size == 64) {
4990       //subgroupAnd(val) -> (exec & ~val) == 0
4991       Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
4992       return bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), tmp, Operand(0u));
4993    } else if (op == nir_op_ior && cluster_size == 64) {
4994       //subgroupOr(val) -> (val & exec) != 0
4995       return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp();
4996    } else if (op == nir_op_ixor && cluster_size == 64) {
4997       //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
4998       Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
4999       tmp = bld.sop1(aco_opcode::s_bcnt1_i32_b64, bld.def(s2), bld.def(s1, scc), tmp);
5000       return bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
5001    } else {
5002       //subgroupClustered{And,Or,Xor}(val, n) ->
5003       //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0))
5004       //cluster_offset = ~(n - 1) & lane_id
5005       //cluster_mask = ((1 << n) - 1)
5006       //subgroupClusteredAnd():
5007       //   return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
5008       //subgroupClusteredOr():
5009       //   return ((val & exec) >> cluster_offset) & cluster_mask != 0
5010       //subgroupClusteredXor():
5011       //   return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
5012       Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5013                               bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5014       Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
5015
5016       Temp tmp;
5017       if (op == nir_op_iand)
5018          tmp = bld.sop2(aco_opcode::s_orn2_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5019       else
5020          tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5021
5022       uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
5023       tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
5024       tmp = emit_extract_vector(ctx, tmp, 0, v1);
5025       if (cluster_mask != 0xffffffff)
5026          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
5027
5028       Definition cmp_def = Definition();
5029       if (op == nir_op_iand) {
5030          cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(cluster_mask), tmp).def(0);
5031       } else if (op == nir_op_ior) {
5032          cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5033       } else if (op == nir_op_ixor) {
5034          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
5035                         bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
5036          cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5037       }
5038       cmp_def.setHint(vcc);
5039       return cmp_def.getTemp();
5040    }
5041 }
5042
5043 Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
5044 {
5045    Builder bld(ctx->program, ctx->block);
5046
5047    //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
5048    //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
5049    //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
5050    Temp tmp;
5051    if (op == nir_op_iand)
5052       tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
5053    else
5054       tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5055
5056    Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
5057    Temp lo = lohi.def(0).getTemp();
5058    Temp hi = lohi.def(1).getTemp();
5059    Temp mbcnt = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), hi,
5060                          bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), lo, Operand(0u)));
5061
5062    Definition cmp_def = Definition();
5063    if (op == nir_op_iand)
5064       cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5065    else if (op == nir_op_ior)
5066       cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5067    else if (op == nir_op_ixor)
5068       cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u),
5069                          bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
5070    cmp_def.setHint(vcc);
5071    return cmp_def.getTemp();
5072 }
5073
5074 Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
5075 {
5076    Builder bld(ctx->program, ctx->block);
5077
5078    //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
5079    //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
5080    //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
5081    Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
5082    if (op == nir_op_iand)
5083       return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5084    else if (op == nir_op_ior)
5085       return bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5086    else if (op == nir_op_ixor)
5087       return bld.sop2(aco_opcode::s_xor_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5088
5089    assert(false);
5090    return Temp();
5091 }
5092
5093 void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
5094 {
5095    Builder bld(ctx->program, ctx->block);
5096    Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
5097    if (src.regClass().type() == RegType::vgpr) {
5098       bld.pseudo(aco_opcode::p_as_uniform, dst, src);
5099    } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5100       bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(dst), Operand(0u), Operand(src));
5101    } else if (src.regClass() == s1) {
5102       bld.sop1(aco_opcode::s_mov_b32, dst, src);
5103    } else if (src.regClass() == s2) {
5104       bld.sop1(aco_opcode::s_mov_b64, dst, src);
5105    } else {
5106       fprintf(stderr, "Unimplemented NIR instr bit size: ");
5107       nir_print_instr(&instr->instr, stderr);
5108       fprintf(stderr, "\n");
5109    }
5110 }
5111
5112 void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
5113 {
5114    Builder bld(ctx->program, ctx->block);
5115    Temp p1 = ctx->fs_inputs[fs_input::persp_center_p1];
5116    Temp p2 = ctx->fs_inputs[fs_input::persp_center_p2];
5117
5118    /* Build DD X/Y */
5119    Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_quad_perm(0, 0, 0, 0));
5120    Temp ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(1, 1, 1, 1));
5121    Temp ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(2, 2, 2, 2));
5122    Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_quad_perm(0, 0, 0, 0));
5123    Temp ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(1, 1, 1, 1));
5124    Temp ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(2, 2, 2, 2));
5125
5126    /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
5127    Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
5128    Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
5129    tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
5130    tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
5131    Temp wqm1 = bld.tmp(v1);
5132    emit_wqm(ctx, tmp1, wqm1, true);
5133    Temp wqm2 = bld.tmp(v1);
5134    emit_wqm(ctx, tmp2, wqm2, true);
5135    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
5136    return;
5137 }
5138
5139 void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
5140 {
5141    Builder bld(ctx->program, ctx->block);
5142    switch(instr->intrinsic) {
5143    case nir_intrinsic_load_barycentric_sample:
5144    case nir_intrinsic_load_barycentric_pixel:
5145    case nir_intrinsic_load_barycentric_centroid: {
5146       glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
5147       fs_input input = get_interp_input(instr->intrinsic, mode);
5148
5149       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5150       if (input == fs_input::max_inputs) {
5151          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5152                     Operand(0u), Operand(0u));
5153       } else {
5154          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5155                     ctx->fs_inputs[input],
5156                     ctx->fs_inputs[input + 1]);
5157       }
5158       emit_split_vector(ctx, dst, 2);
5159       break;
5160    }
5161    case nir_intrinsic_load_barycentric_at_sample: {
5162       uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
5163       switch (ctx->options->key.fs.num_samples) {
5164          case 2: sample_pos_offset += 1 << 3; break;
5165          case 4: sample_pos_offset += 3 << 3; break;
5166          case 8: sample_pos_offset += 7 << 3; break;
5167          default: break;
5168       }
5169       Temp sample_pos;
5170       Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
5171       nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
5172       if (addr.type() == RegType::sgpr) {
5173          Operand offset;
5174          if (const_addr) {
5175             sample_pos_offset += const_addr->u32 << 3;
5176             offset = Operand(sample_pos_offset);
5177          } else if (ctx->options->chip_class >= GFX9) {
5178             offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
5179          } else {
5180             offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
5181             offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
5182          }
5183          addr = ctx->private_segment_buffer;
5184          sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand(offset));
5185
5186       } else if (ctx->options->chip_class >= GFX9) {
5187          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5188          sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, ctx->private_segment_buffer, sample_pos_offset);
5189       } else {
5190          /* addr += ctx->private_segment_buffer + sample_pos_offset */
5191          Temp tmp0 = bld.tmp(s1);
5192          Temp tmp1 = bld.tmp(s1);
5193          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), ctx->private_segment_buffer);
5194          Definition scc_tmp = bld.def(s1, scc);
5195          tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
5196          tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), scc_tmp.getTemp());
5197          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5198          Temp pck0 = bld.tmp(v1);
5199          Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
5200          tmp1 = as_vgpr(ctx, tmp1);
5201          Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(s2)), tmp1, Operand(0u), carry);
5202          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
5203
5204          /* sample_pos = flat_load_dwordx2 addr */
5205          sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
5206       }
5207
5208       /* sample_pos -= 0.5 */
5209       Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
5210       Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
5211       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
5212       pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
5213       pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
5214
5215       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5216       break;
5217    }
5218    case nir_intrinsic_load_barycentric_at_offset: {
5219       Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5220       RegClass rc = RegClass(offset.type(), 1);
5221       Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
5222       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
5223       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5224       break;
5225    }
5226    case nir_intrinsic_load_front_face: {
5227       bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5228                Operand(0u), ctx->fs_inputs[fs_input::front_face]).def(0).setHint(vcc);
5229       break;
5230    }
5231    case nir_intrinsic_load_view_index:
5232    case nir_intrinsic_load_layer_id: {
5233       if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & sw_vs)) {
5234          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5235          bld.copy(Definition(dst), Operand(ctx->view_index));
5236          break;
5237       }
5238
5239       unsigned idx = nir_intrinsic_base(instr);
5240       bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5241                  Operand(2u), bld.m0(ctx->prim_mask), idx, 0);
5242       break;
5243    }
5244    case nir_intrinsic_load_frag_coord: {
5245       emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
5246       break;
5247    }
5248    case nir_intrinsic_load_sample_pos: {
5249       Temp posx = ctx->fs_inputs[fs_input::frag_pos_0];
5250       Temp posy = ctx->fs_inputs[fs_input::frag_pos_1];
5251       bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5252                  posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
5253                  posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
5254       break;
5255    }
5256    case nir_intrinsic_load_interpolated_input:
5257       visit_load_interpolated_input(ctx, instr);
5258       break;
5259    case nir_intrinsic_store_output:
5260       visit_store_output(ctx, instr);
5261       break;
5262    case nir_intrinsic_load_input:
5263       visit_load_input(ctx, instr);
5264       break;
5265    case nir_intrinsic_load_ubo:
5266       visit_load_ubo(ctx, instr);
5267       break;
5268    case nir_intrinsic_load_push_constant:
5269       visit_load_push_constant(ctx, instr);
5270       break;
5271    case nir_intrinsic_load_constant:
5272       visit_load_constant(ctx, instr);
5273       break;
5274    case nir_intrinsic_vulkan_resource_index:
5275       visit_load_resource(ctx, instr);
5276       break;
5277    case nir_intrinsic_discard:
5278       visit_discard(ctx, instr);
5279       break;
5280    case nir_intrinsic_discard_if:
5281       visit_discard_if(ctx, instr);
5282       break;
5283    case nir_intrinsic_load_shared:
5284       visit_load_shared(ctx, instr);
5285       break;
5286    case nir_intrinsic_store_shared:
5287       visit_store_shared(ctx, instr);
5288       break;
5289    case nir_intrinsic_shared_atomic_add:
5290    case nir_intrinsic_shared_atomic_imin:
5291    case nir_intrinsic_shared_atomic_umin:
5292    case nir_intrinsic_shared_atomic_imax:
5293    case nir_intrinsic_shared_atomic_umax:
5294    case nir_intrinsic_shared_atomic_and:
5295    case nir_intrinsic_shared_atomic_or:
5296    case nir_intrinsic_shared_atomic_xor:
5297    case nir_intrinsic_shared_atomic_exchange:
5298    case nir_intrinsic_shared_atomic_comp_swap:
5299       visit_shared_atomic(ctx, instr);
5300       break;
5301    case nir_intrinsic_image_deref_load:
5302       visit_image_load(ctx, instr);
5303       break;
5304    case nir_intrinsic_image_deref_store:
5305       visit_image_store(ctx, instr);
5306       break;
5307    case nir_intrinsic_image_deref_atomic_add:
5308    case nir_intrinsic_image_deref_atomic_umin:
5309    case nir_intrinsic_image_deref_atomic_imin:
5310    case nir_intrinsic_image_deref_atomic_umax:
5311    case nir_intrinsic_image_deref_atomic_imax:
5312    case nir_intrinsic_image_deref_atomic_and:
5313    case nir_intrinsic_image_deref_atomic_or:
5314    case nir_intrinsic_image_deref_atomic_xor:
5315    case nir_intrinsic_image_deref_atomic_exchange:
5316    case nir_intrinsic_image_deref_atomic_comp_swap:
5317       visit_image_atomic(ctx, instr);
5318       break;
5319    case nir_intrinsic_image_deref_size:
5320       visit_image_size(ctx, instr);
5321       break;
5322    case nir_intrinsic_load_ssbo:
5323       visit_load_ssbo(ctx, instr);
5324       break;
5325    case nir_intrinsic_store_ssbo:
5326       visit_store_ssbo(ctx, instr);
5327       break;
5328    case nir_intrinsic_load_global:
5329       visit_load_global(ctx, instr);
5330       break;
5331    case nir_intrinsic_store_global:
5332       visit_store_global(ctx, instr);
5333       break;
5334    case nir_intrinsic_ssbo_atomic_add:
5335    case nir_intrinsic_ssbo_atomic_imin:
5336    case nir_intrinsic_ssbo_atomic_umin:
5337    case nir_intrinsic_ssbo_atomic_imax:
5338    case nir_intrinsic_ssbo_atomic_umax:
5339    case nir_intrinsic_ssbo_atomic_and:
5340    case nir_intrinsic_ssbo_atomic_or:
5341    case nir_intrinsic_ssbo_atomic_xor:
5342    case nir_intrinsic_ssbo_atomic_exchange:
5343    case nir_intrinsic_ssbo_atomic_comp_swap:
5344       visit_atomic_ssbo(ctx, instr);
5345       break;
5346    case nir_intrinsic_load_scratch:
5347       visit_load_scratch(ctx, instr);
5348       break;
5349    case nir_intrinsic_store_scratch:
5350       visit_store_scratch(ctx, instr);
5351       break;
5352    case nir_intrinsic_get_buffer_size:
5353       visit_get_buffer_size(ctx, instr);
5354       break;
5355    case nir_intrinsic_barrier: {
5356       unsigned* bsize = ctx->program->info->cs.block_size;
5357       unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
5358       if (workgroup_size > 64)
5359          bld.sopp(aco_opcode::s_barrier);
5360       break;
5361    }
5362    case nir_intrinsic_group_memory_barrier:
5363    case nir_intrinsic_memory_barrier:
5364    case nir_intrinsic_memory_barrier_atomic_counter:
5365    case nir_intrinsic_memory_barrier_buffer:
5366    case nir_intrinsic_memory_barrier_image:
5367    case nir_intrinsic_memory_barrier_shared:
5368       emit_memory_barrier(ctx, instr);
5369       break;
5370    case nir_intrinsic_load_num_work_groups:
5371    case nir_intrinsic_load_work_group_id:
5372    case nir_intrinsic_load_local_invocation_id: {
5373       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5374       Temp* ids;
5375       if (instr->intrinsic == nir_intrinsic_load_num_work_groups)
5376          ids = ctx->num_workgroups;
5377       else if (instr->intrinsic == nir_intrinsic_load_work_group_id)
5378          ids = ctx->workgroup_ids;
5379       else
5380          ids = ctx->local_invocation_ids;
5381       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5382                  ids[0].id() ? Operand(ids[0]) : Operand(1u),
5383                  ids[1].id() ? Operand(ids[1]) : Operand(1u),
5384                  ids[2].id() ? Operand(ids[2]) : Operand(1u));
5385       emit_split_vector(ctx, dst, 3);
5386       break;
5387    }
5388    case nir_intrinsic_load_local_invocation_index: {
5389       Temp id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5390                          bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5391       Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
5392       bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
5393       break;
5394    }
5395    case nir_intrinsic_load_subgroup_id: {
5396       if (ctx->stage == compute_cs) {
5397          Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
5398          bld.sop2(aco_opcode::s_lshr_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), tg_num, Operand(0x6u));
5399       } else {
5400          bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
5401       }
5402       break;
5403    }
5404    case nir_intrinsic_load_subgroup_invocation: {
5405       bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand((uint32_t) -1),
5406                bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5407       break;
5408    }
5409    case nir_intrinsic_load_num_subgroups: {
5410       if (ctx->stage == compute_cs)
5411          bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu), ctx->tg_size);
5412       else
5413          bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
5414       break;
5415    }
5416    case nir_intrinsic_ballot: {
5417       Definition tmp = bld.def(s2);
5418       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5419       if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s2) {
5420          bld.sop2(aco_opcode::s_and_b64, tmp, bld.def(s1, scc), Operand(exec, s2), src);
5421       } else if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s1) {
5422          bld.sop2(aco_opcode::s_cselect_b64, tmp, Operand(exec, s2), Operand(0u), bld.scc(src));
5423       } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
5424          bld.vopc(aco_opcode::v_cmp_lg_u32, tmp, Operand(0u), src);
5425       } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
5426          bld.vopc(aco_opcode::v_cmp_lg_u64, tmp, Operand(0u), src);
5427       } else {
5428          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5429          nir_print_instr(&instr->instr, stderr);
5430          fprintf(stderr, "\n");
5431       }
5432       emit_wqm(ctx, tmp.getTemp(), get_ssa_temp(ctx, &instr->dest.ssa));
5433       break;
5434    }
5435    case nir_intrinsic_shuffle: {
5436       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5437       if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5438          emit_uniform_subgroup(ctx, instr, src);
5439       } else {
5440          Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
5441          assert(tid.regClass() == v1);
5442          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5443          if (src.regClass() == v1) {
5444             tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
5445             emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, src), dst);
5446          } else if (src.regClass() == v2) {
5447             tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
5448
5449             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5450             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5451             lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, lo));
5452             hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, hi));
5453             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5454             emit_split_vector(ctx, dst, 2);
5455          } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5456             Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
5457             tmp = emit_extract_vector(ctx, tmp, 0, v1);
5458             tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
5459             emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp), dst);
5460          } else {
5461             fprintf(stderr, "Unimplemented NIR instr bit size: ");
5462             nir_print_instr(&instr->instr, stderr);
5463             fprintf(stderr, "\n");
5464          }
5465       }
5466       break;
5467    }
5468    case nir_intrinsic_load_sample_id: {
5469       bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5470                ctx->fs_inputs[ancillary], Operand(8u), Operand(4u));
5471       break;
5472    }
5473    case nir_intrinsic_load_sample_mask_in: {
5474       visit_load_sample_mask_in(ctx, instr);
5475       break;
5476    }
5477    case nir_intrinsic_read_first_invocation: {
5478       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5479       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5480       if (src.regClass() == v1) {
5481          emit_wqm(ctx,
5482                   bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
5483                   dst);
5484       } else if (src.regClass() == v2) {
5485          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5486          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5487          lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
5488          hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
5489          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5490          emit_split_vector(ctx, dst, 2);
5491       } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5492          emit_wqm(ctx,
5493                   bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src,
5494                            bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2))),
5495                   dst);
5496       } else if (src.regClass() == s1) {
5497          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
5498       } else if (src.regClass() == s2) {
5499          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
5500       } else {
5501          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5502          nir_print_instr(&instr->instr, stderr);
5503          fprintf(stderr, "\n");
5504       }
5505       break;
5506    }
5507    case nir_intrinsic_read_invocation: {
5508       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5509       Temp lane = get_ssa_temp(ctx, instr->src[1].ssa);
5510       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5511       assert(lane.regClass() == s1);
5512       if (src.regClass() == v1) {
5513          emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), src, lane), dst);
5514       } else if (src.regClass() == v2) {
5515          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5516          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5517          lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), lo, lane));
5518          hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), hi, lane));
5519          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5520          emit_split_vector(ctx, dst, 2);
5521       } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5522          emit_wqm(ctx, bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, lane), dst);
5523       } else if (src.regClass() == s1) {
5524          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
5525       } else if (src.regClass() == s2) {
5526          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
5527       } else {
5528          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5529          nir_print_instr(&instr->instr, stderr);
5530          fprintf(stderr, "\n");
5531       }
5532       break;
5533    }
5534    case nir_intrinsic_vote_all: {
5535       Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
5536       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5537       assert(src.regClass() == s2);
5538       assert(dst.regClass() == s1);
5539
5540       Definition tmp = bld.def(s1);
5541       bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(tmp),
5542                bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)),
5543                Operand(exec, s2));
5544       emit_wqm(ctx, tmp.getTemp(), dst);
5545       break;
5546    }
5547    case nir_intrinsic_vote_any: {
5548       Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
5549       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5550       assert(src.regClass() == s2);
5551       assert(dst.regClass() == s1);
5552
5553       Definition tmp = bld.def(s1);
5554       bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(tmp), src, Operand(exec, s2));
5555       emit_wqm(ctx, tmp.getTemp(), dst);
5556       break;
5557    }
5558    case nir_intrinsic_reduce:
5559    case nir_intrinsic_inclusive_scan:
5560    case nir_intrinsic_exclusive_scan: {
5561       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5562       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5563       nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
5564       unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
5565          nir_intrinsic_cluster_size(instr) : 0;
5566       cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : 64, 64));
5567
5568       if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
5569          emit_uniform_subgroup(ctx, instr, src);
5570       } else if (instr->dest.ssa.bit_size == 1) {
5571          if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
5572             op = nir_op_iand;
5573          else if (op == nir_op_iadd)
5574             op = nir_op_ixor;
5575          else if (op == nir_op_umax || op == nir_op_imax)
5576             op = nir_op_ior;
5577          assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
5578
5579          switch (instr->intrinsic) {
5580          case nir_intrinsic_reduce:
5581             emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
5582             break;
5583          case nir_intrinsic_exclusive_scan:
5584             emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
5585             break;
5586          case nir_intrinsic_inclusive_scan:
5587             emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
5588             break;
5589          default:
5590             assert(false);
5591          }
5592       } else if (cluster_size == 1) {
5593          bld.copy(Definition(dst), src);
5594       } else {
5595          src = as_vgpr(ctx, src);
5596
5597          ReduceOp reduce_op;
5598          switch (op) {
5599          #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
5600             CASE(iadd)
5601             CASE(imul)
5602             CASE(fadd)
5603             CASE(fmul)
5604             CASE(imin)
5605             CASE(umin)
5606             CASE(fmin)
5607             CASE(imax)
5608             CASE(umax)
5609             CASE(fmax)
5610             CASE(iand)
5611             CASE(ior)
5612             CASE(ixor)
5613             default:
5614                unreachable("unknown reduction op");
5615          #undef CASE
5616          }
5617
5618          aco_opcode aco_op;
5619          switch (instr->intrinsic) {
5620             case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
5621             case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
5622             case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
5623             default:
5624                unreachable("unknown reduce intrinsic");
5625          }
5626
5627          aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
5628          reduce->operands[0] = Operand(src);
5629          // filled in by aco_reduce_assign.cpp, used internally as part of the
5630          // reduce sequence
5631          assert(dst.size() == 1 || dst.size() == 2);
5632          reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
5633          reduce->operands[2] = Operand(v1.as_linear());
5634
5635          Temp tmp_dst = bld.tmp(dst.regClass());
5636          reduce->definitions[0] = Definition(tmp_dst);
5637          reduce->definitions[1] = bld.def(s2); // used internally
5638          reduce->definitions[2] = Definition();
5639          reduce->definitions[3] = Definition(scc, s1);
5640          reduce->definitions[4] = Definition();
5641          reduce->reduce_op = reduce_op;
5642          reduce->cluster_size = cluster_size;
5643          ctx->block->instructions.emplace_back(std::move(reduce));
5644
5645          emit_wqm(ctx, tmp_dst, dst);
5646       }
5647       break;
5648    }
5649    case nir_intrinsic_quad_broadcast: {
5650       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5651       if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5652          emit_uniform_subgroup(ctx, instr, src);
5653       } else {
5654          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5655          unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
5656          if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5657             uint32_t half_mask = 0x11111111u << lane;
5658             Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
5659             Temp tmp = bld.tmp(s2);
5660             bld.sop1(aco_opcode::s_wqm_b64, Definition(tmp),
5661                      bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), mask_tmp,
5662                               bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2))));
5663             emit_wqm(ctx, tmp, dst);
5664          } else if (instr->dest.ssa.bit_size == 32) {
5665             emit_wqm(ctx,
5666                      bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src,
5667                                   dpp_quad_perm(lane, lane, lane, lane)),
5668                      dst);
5669          } else if (instr->dest.ssa.bit_size == 64) {
5670             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5671             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5672             lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_quad_perm(lane, lane, lane, lane)));
5673             hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_quad_perm(lane, lane, lane, lane)));
5674             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5675             emit_split_vector(ctx, dst, 2);
5676          } else {
5677             fprintf(stderr, "Unimplemented NIR instr bit size: ");
5678             nir_print_instr(&instr->instr, stderr);
5679             fprintf(stderr, "\n");
5680          }
5681       }
5682       break;
5683    }
5684    case nir_intrinsic_quad_swap_horizontal:
5685    case nir_intrinsic_quad_swap_vertical:
5686    case nir_intrinsic_quad_swap_diagonal:
5687    case nir_intrinsic_quad_swizzle_amd: {
5688       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5689       if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5690          emit_uniform_subgroup(ctx, instr, src);
5691          break;
5692       }
5693       uint16_t dpp_ctrl = 0;
5694       switch (instr->intrinsic) {
5695       case nir_intrinsic_quad_swap_horizontal:
5696          dpp_ctrl = dpp_quad_perm(1, 0, 3, 2);
5697          break;
5698       case nir_intrinsic_quad_swap_vertical:
5699          dpp_ctrl = dpp_quad_perm(2, 3, 0, 1);
5700          break;
5701       case nir_intrinsic_quad_swap_diagonal:
5702          dpp_ctrl = dpp_quad_perm(3, 2, 1, 0);
5703          break;
5704       case nir_intrinsic_quad_swizzle_amd: {
5705          dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
5706          break;
5707       }
5708       default:
5709          break;
5710       }
5711
5712       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5713       if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5714          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
5715          src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5716          Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), src);
5717          emit_wqm(ctx, tmp, dst);
5718       } else if (instr->dest.ssa.bit_size == 32) {
5719          Temp tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5720          emit_wqm(ctx, tmp, dst);
5721       } else if (instr->dest.ssa.bit_size == 64) {
5722          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5723          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5724          lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
5725          hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
5726          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5727          emit_split_vector(ctx, dst, 2);
5728       } else {
5729          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5730          nir_print_instr(&instr->instr, stderr);
5731          fprintf(stderr, "\n");
5732       }
5733       break;
5734    }
5735    case nir_intrinsic_masked_swizzle_amd: {
5736       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5737       if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5738          emit_uniform_subgroup(ctx, instr, src);
5739          break;
5740       }
5741       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5742       uint32_t mask = nir_intrinsic_swizzle_mask(instr);
5743       if (dst.regClass() == v1) {
5744          emit_wqm(ctx,
5745                   bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
5746                   dst);
5747       } else if (dst.regClass() == v2) {
5748          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5749          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5750          lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
5751          hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
5752          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5753          emit_split_vector(ctx, dst, 2);
5754       } else {
5755          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5756          nir_print_instr(&instr->instr, stderr);
5757          fprintf(stderr, "\n");
5758       }
5759       break;
5760    }
5761    case nir_intrinsic_write_invocation_amd: {
5762       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5763       Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
5764       Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
5765       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5766       if (dst.regClass() == v1) {
5767          /* src2 is ignored for writelane. RA assigns the same reg for dst */
5768          emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val, lane, src), dst);
5769       } else if (dst.regClass() == v2) {
5770          Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
5771          Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
5772          bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
5773          bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
5774          Temp lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_lo, lane, src_hi));
5775          Temp hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_hi, lane, src_hi));
5776          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5777          emit_split_vector(ctx, dst, 2);
5778       } else {
5779          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5780          nir_print_instr(&instr->instr, stderr);
5781          fprintf(stderr, "\n");
5782       }
5783       break;
5784    }
5785    case nir_intrinsic_mbcnt_amd: {
5786       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5787       RegClass rc = RegClass(src.type(), 1);
5788       Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
5789       bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
5790       Temp tmp = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, Operand(0u));
5791       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5792       Temp wqm_tmp = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), mask_hi, tmp);
5793       emit_wqm(ctx, wqm_tmp, dst);
5794       break;
5795    }
5796    case nir_intrinsic_load_helper_invocation: {
5797       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5798       bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
5799       ctx->block->kind |= block_kind_needs_lowering;
5800       ctx->program->needs_exact = true;
5801       break;
5802    }
5803    case nir_intrinsic_is_helper_invocation: {
5804       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5805       bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
5806       ctx->block->kind |= block_kind_needs_lowering;
5807       ctx->program->needs_exact = true;
5808       break;
5809    }
5810    case nir_intrinsic_demote:
5811       bld.pseudo(aco_opcode::p_demote_to_helper);
5812       ctx->block->kind |= block_kind_uses_demote;
5813       ctx->program->needs_exact = true;
5814       break;
5815    case nir_intrinsic_demote_if: {
5816       Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
5817                            as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false),
5818                            Operand(exec, s2));
5819       bld.pseudo(aco_opcode::p_demote_to_helper, cond);
5820       ctx->block->kind |= block_kind_uses_demote;
5821       ctx->program->needs_exact = true;
5822       break;
5823    }
5824    case nir_intrinsic_first_invocation: {
5825       emit_wqm(ctx, bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)),
5826                get_ssa_temp(ctx, &instr->dest.ssa));
5827       break;
5828    }
5829    case nir_intrinsic_shader_clock:
5830       bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)));
5831       break;
5832    case nir_intrinsic_load_vertex_id_zero_base: {
5833       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5834       bld.copy(Definition(dst), ctx->vertex_id);
5835       break;
5836    }
5837    case nir_intrinsic_load_first_vertex: {
5838       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5839       bld.copy(Definition(dst), ctx->base_vertex);
5840       break;
5841    }
5842    case nir_intrinsic_load_base_instance: {
5843       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5844       bld.copy(Definition(dst), ctx->start_instance);
5845       break;
5846    }
5847    case nir_intrinsic_load_instance_id: {
5848       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5849       bld.copy(Definition(dst), ctx->instance_id);
5850       break;
5851    }
5852    case nir_intrinsic_load_draw_id: {
5853       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5854       bld.copy(Definition(dst), ctx->draw_id);
5855       break;
5856    }
5857    default:
5858       fprintf(stderr, "Unimplemented intrinsic instr: ");
5859       nir_print_instr(&instr->instr, stderr);
5860       fprintf(stderr, "\n");
5861       abort();
5862
5863       break;
5864    }
5865 }
5866
5867
/* Looks up the descriptors required by a texture instruction.
 *
 * Scans instr's sources for the texture deref, the sampler deref and an
 * optional plane index, then writes through the output pointers:
 *   *stype     - glsl_get_sampler_result_type() of the texture deref's type
 *   *res_ptr   - get_sampler_desc() result for the texture deref; the
 *                descriptor type is a plane, buffer or image descriptor
 *   *samp_ptr  - sampler descriptor; only written when samp_ptr is non-NULL
 *   *fmask_ptr - FMASK descriptor; only written when fmask_ptr is non-NULL
 *                and the op is txf_ms or samples_identical
 *
 * stype and res_ptr are written unconditionally. The texture deref is
 * dereferenced without a NULL check, so instr must carry a
 * nir_tex_src_texture_deref source.
 *
 * Calls abort() when a sampler descriptor is requested for a sampler_dim
 * below GLSL_SAMPLER_DIM_RECT on a chip_class below GFX8 (unimplemented).
 */
5868 void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr,
5869                     Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr,
5870                     enum glsl_base_type *stype)
5871 {
5872    nir_deref_instr *texture_deref_instr = NULL;
5873    nir_deref_instr *sampler_deref_instr = NULL;
5874    int plane = -1;
5875
        /* Pick out the sources of interest; every other source type is ignored here. */
5876    for (unsigned i = 0; i < instr->num_srcs; i++) {
5877       switch (instr->src[i].src_type) {
5878       case nir_tex_src_texture_deref:
5879          texture_deref_instr = nir_src_as_deref(instr->src[i].src);
5880          break;
5881       case nir_tex_src_sampler_deref:
5882          sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
5883          break;
5884       case nir_tex_src_plane:
5885          plane = nir_src_as_int(instr->src[i].src);
5886          break;
5887       default:
5888          break;
5889       }
5890    }
5891
5892    *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
5893
        /* No separate sampler deref: use the texture deref for the sampler descriptor too. */
5894    if (!sampler_deref_instr)
5895       sampler_deref_instr = texture_deref_instr;
5896
        /* Resource descriptor: a plane source takes precedence, then buffer textures, then plain images. */
5897    if (plane >= 0) {
5898       assert(instr->op != nir_texop_txf_ms &&
5899              instr->op != nir_texop_samples_identical);
5900       assert(instr->sampler_dim  != GLSL_SAMPLER_DIM_BUF);
           /* the descriptor type is selected by offsetting ACO_DESC_PLANE_0 with the plane index */
5901       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
5902    } else if (instr->sampler_dim  == GLSL_SAMPLER_DIM_BUF) {
5903       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
5904    } else {
5905       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
5906    }
5907    if (samp_ptr) {
5908       *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false);
           /* This combination would need the adjustment named in the TODO below; until then, report and abort. */
5909       if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
5910          fprintf(stderr, "Unimplemented sampler descriptor: ");
5911          nir_print_instr(&instr->instr, stderr);
5912          fprintf(stderr, "\n");
5913          abort();
5914          // TODO: build samp_ptr = and(samp_ptr, res_ptr)
5915       }
5916    }
        /* The FMASK descriptor is only fetched for the two ops checked here. */
5917    if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
5918                      instr->op == nir_texop_samples_identical))
5919       *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
5920 }
5921
/* Emits the per-lane selection of the sc / tc / ma terms of a cube-map
 * derivative.
 *
 * ma and id are the values prepare_cube_coords() obtains from v_cubema_f32
 * and v_cubeid_f32; deriv is a vector whose first three components are read.
 * On return *out_ma, *out_sc and *out_tc each hold a new v1 temporary.
 *
 * NOTE(review): the comments below read v_cndmask_b32 as "second source where
 * the mask bit is set, first source otherwise" and id as the cube face index
 * (0/1 = X, 2/3 = Y, 4/5 = Z) -- confirm both against the ISA documentation.
 */
5922 void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
5923                        Temp *out_ma, Temp *out_sc, Temp *out_tc)
5924 {
5925    Builder bld(ctx->program, ctx->block);
5926
5927    Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
5928    Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
5929    Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
5930
        /* IEEE-754 single-precision bit patterns of -1.0, 1.0, 2.0 and 4.0 */
5931    Operand neg_one(0xbf800000u);
5932    Operand one(0x3f800000u);
5933    Operand two(0x40000000u);
5934    Operand four(0x40800000u);
5935
        /* sgn_ma = (0.0 <= ma) ? 1.0 : -1.0, and neg_sgn_ma = 0.0 - sgn_ma */
5936    Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), ma);
5937    Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
5938    Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
5939
        /* Classify the major axis from id: 4.0 <= id is the Z case, 2.0 <= id
         * with the Z lanes masked out is the Y case, and is_not_ma_x is the
         * union of the two. */
5940    Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), four, id);
5941    Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id);
5942    is_ma_y = bld.sop2(aco_opcode::s_andn2_b64, bld.hint_vcc(bld.def(s2)), is_ma_y, is_ma_z);
5943    Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), is_ma_z, is_ma_y);
5944
5945    // select sc
        /* value: deriv_x unless X is the major axis, then deriv_z;
         * sign:  1.0 for Y-major, sgn_ma for Z-major, neg_sgn_ma otherwise */
5946    Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
5947    Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1),
5948                        bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z),
5949                        one, is_ma_y);
5950    *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
5951
5952    // select tc
        /* value: deriv_z for Y-major, deriv_y otherwise;
         * sign:  sgn_ma for Y-major, -1.0 otherwise */
5953    tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
5954    sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
5955    *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
5956
5957    // select ma
        /* value: the derivative component along the major axis; the AND with
         * 0x7fffffff clears the sign bit (absolute value), and the result is
         * then multiplied by 2.0 */
5958    tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
5959                   bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
5960                   deriv_z, is_ma_z);
5961    tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp);
5962    *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
5963 }
5964
/* Rewrites cube-map texture coordinates (and optionally their derivatives)
 * in place.
 *
 * coords   in: vector with 3 components (4 when is_array);
 *          out: new v3 vector (sc, tc, id).
 * ddx/ddy  only accessed when is_deriv is set; each is then replaced by a
 *          new v2 vector.
 * is_deriv selects the explicit-derivative path.
 * is_array the fourth coordinate is a layer that gets folded into id.
 *
 * NOTE(review): the arithmetic comments below assume v_madak_f32 computes
 * src0 * src1 + K and v_madmk_f32 computes src0 * K + src1 -- confirm
 * against the ISA documentation.
 */
5965 void prepare_cube_coords(isel_context *ctx, Temp* coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
5966 {
5967    Builder bld(ctx->program, ctx->block);
5968    Temp coord_args[4], ma, tc, sc, id;
5969    for (unsigned i = 0; i < (is_array ? 4 : 3); i++)
5970       coord_args[i] = emit_extract_vector(ctx, *coords, i, v1);
5971
5972    if (is_array) {
           /* round the layer coordinate with v_rndne_f32 */
5973       coord_args[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_args[3]);
5974
5975       // see comment in ac_prepare_cube_coords()
           /* on GFX8 and older the rounded layer is additionally clamped to be >= 0 */
5976       if (ctx->options->chip_class <= GFX8)
5977          coord_args[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coord_args[3]);
5978    }
5979
5980    ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
5981
        /* invma = v_rcp_f32(|ma|): built by hand in VOP3 encoding so that the
         * abs modifier can be set on the source operand */
5982    aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
5983    vop3a->operands[0] = Operand(ma);
5984    vop3a->abs[0] = true;
5985    Temp invma = bld.tmp(v1);
5986    vop3a->definitions[0] = Definition(invma);
5987    ctx->block->instructions.emplace_back(std::move(vop3a));
5988
        /* Without derivatives, scale and bias happen in one step: sc * invma + 1.5 (same for tc). */
5989    sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
5990    if (!is_deriv)
5991       sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
5992
5993    tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
5994    if (!is_deriv)
5995       tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
5996
5997    id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
5998
5999    if (is_deriv) {
           /* With derivatives the 1.5 bias is delayed: the derivative math
            * below uses sc and tc scaled by invma but not yet biased. */
6000       sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
6001       tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
6002
           /* iteration 0 rewrites *ddx, iteration 1 rewrites *ddy */
6003       for (unsigned i = 0; i < 2; i++) {
6004          // see comment in ac_prepare_cube_coords()
6005          Temp deriv_ma;
6006          Temp deriv_sc, deriv_tc;
6007          build_cube_select(ctx, ma, id, i ? *ddy : *ddx,
6008                            &deriv_ma, &deriv_sc, &deriv_tc);
6009
6010          deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
6011
              /* x = deriv_sc * invma - deriv_ma * sc
               * y = deriv_tc * invma - deriv_ma * tc
               * where deriv_ma has already been multiplied by invma above */
6012          Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6013                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
6014                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
6015          Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6016                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
6017                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
6018          *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
6019       }
6020
           /* now apply the 1.5 bias that the non-derivative path folded into v_madak_f32 */
6021       sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc);
6022       tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc);
6023    }
6024
        /* cube arrays: id = layer * 8.0 + id */
6025    if (is_array)
6026       id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coord_args[3], id, Operand(0x41000000u/*8.0*/));
6027    *coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), sc, tc, id);
6028
6029 }
6030
/* Returns a new VGPR vector with the same number of components as coords,
 * in which component idx has been passed through v_rndne_f32 and all other
 * components are carried over unchanged.
 *
 * The scratch array has three entries and nothing here checks the bounds, so
 * callers must pass coords.size() <= 3 and idx < coords.size().
 */
6031 Temp apply_round_slice(isel_context *ctx, Temp coords, unsigned idx)
6032 {
6033    Temp coord_vec[3];
6034    for (unsigned i = 0; i < coords.size(); i++)
6035       coord_vec[i] = emit_extract_vector(ctx, coords, i, v1);
6036
6037    Builder bld(ctx->program, ctx->block);
6038    coord_vec[idx] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_vec[idx]);
6039
        /* Reassemble the components; p_create_vector is built by hand because
         * the operand count is only known at run time. */
6040    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6041    for (unsigned i = 0; i < coords.size(); i++)
6042       vec->operands[i] = Operand(coord_vec[i]);
6043    Temp res = bld.tmp(RegType::vgpr, coords.size());
6044    vec->definitions[0] = Definition(res);
6045    ctx->block->instructions.emplace_back(std::move(vec));
6046    return res;
6047 }
6048
/* Reports which components of a NIR vecN value are constants.
 *
 * If vec is defined by an ALU instruction whose opcode is
 * nir_op_vec(vec->num_components), then for every component i:
 *   cv[i] = nir_src_as_const_value() of source i when that source's
 *           swizzle[0] is 0, otherwise NULL.
 *
 * In every other case the function returns without writing to cv, so callers
 * should pre-initialise the array. The parameter is declared with four
 * entries, but vec->num_components is not checked against that here.
 */
6049 void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
6050 {
6051    if (vec->parent_instr->type != nir_instr_type_alu)
6052       return;
6053    nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr);
6054    if (vec_instr->op != nir_op_vec(vec->num_components))
6055       return;
6056
6057    for (unsigned i = 0; i < vec->num_components; i++) {
           /* sources whose first swizzle entry is not 0 are reported as NULL (not constant) */
6058       cv[i] = vec_instr->src[i].swizzle[0] == 0 ?
6059               nir_src_as_const_value(vec_instr->src[i].src) : NULL;
6060    }
6061 }
6062
6063 void visit_tex(isel_context *ctx, nir_tex_instr *instr)
6064 {
6065    Builder bld(ctx->program, ctx->block);
6066    bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
6067         has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
6068    Temp resource, sampler, fmask_ptr, bias = Temp(), coords, compare = Temp(), sample_index = Temp(),
6069         lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), derivs = Temp();
6070    nir_const_value *sample_index_cv = NULL;
6071    nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
6072    enum glsl_base_type stype;
6073    tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
6074
6075    bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
6076                                   (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
6077    bool tg4_integer_cube_workaround = tg4_integer_workarounds &&
6078                                       instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
6079
6080    for (unsigned i = 0; i < instr->num_srcs; i++) {
6081       switch (instr->src[i].src_type) {
6082       case nir_tex_src_coord:
6083          coords = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[i].src.ssa));
6084          break;
6085       case nir_tex_src_bias:
6086          if (instr->op == nir_texop_txb) {
6087             bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
6088             has_bias = true;
6089          }
6090          break;
6091       case nir_tex_src_lod: {
6092          nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
6093
6094          if (val && val->f32 <= 0.0) {
6095             level_zero = true;
6096          } else {
6097             lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
6098             has_lod = true;
6099          }
6100          break;
6101       }
6102       case nir_tex_src_comparator:
6103          if (instr->is_shadow) {
6104             compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
6105             has_compare = true;
6106          }
6107          break;
6108       case nir_tex_src_offset:
6109          offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
6110          get_const_vec(instr->src[i].src.ssa, const_offset);
6111          has_offset = true;
6112          break;
6113       case nir_tex_src_ddx:
6114          ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
6115          has_ddx = true;
6116          break;
6117       case nir_tex_src_ddy:
6118          ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
6119          has_ddy = true;
6120          break;
6121       case nir_tex_src_ms_index:
6122          sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
6123          sample_index_cv = nir_src_as_const_value(instr->src[i].src);
6124          has_sample_index = true;
6125          break;
6126       case nir_tex_src_texture_offset:
6127       case nir_tex_src_sampler_offset:
6128       default:
6129          break;
6130       }
6131    }
6132 // TODO: all other cases: structure taken from ac_nir_to_llvm.c
6133    if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
6134       return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
6135
6136    if (instr->op == nir_texop_texture_samples) {
6137       Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
6138
6139       Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
6140       Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
6141       Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
6142       Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
6143
6144       bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
6145                samples, Operand(1u), bld.scc(is_msaa));
6146       return;
6147    }
6148
6149    if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
6150       aco_ptr<Instruction> tmp_instr;
6151       Temp acc, pack = Temp();
6152
6153       uint32_t pack_const = 0;
6154       for (unsigned i = 0; i < offset.size(); i++) {
6155          if (!const_offset[i])
6156             continue;
6157          pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
6158       }
6159
6160       if (offset.type() == RegType::sgpr) {
6161          for (unsigned i = 0; i < offset.size(); i++) {
6162             if (const_offset[i])
6163                continue;
6164
6165             acc = emit_extract_vector(ctx, offset, i, s1);
6166             acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
6167
6168             if (i) {
6169                acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
6170             }
6171
6172             if (pack == Temp()) {
6173                pack = acc;
6174             } else {
6175                pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
6176             }
6177          }
6178
6179          if (pack_const && pack != Temp())
6180             pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
6181       } else {
6182          for (unsigned i = 0; i < offset.size(); i++) {
6183             if (const_offset[i])
6184                continue;
6185
6186             acc = emit_extract_vector(ctx, offset, i, v1);
6187             acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
6188
6189             if (i) {
6190                acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
6191             }
6192
6193             if (pack == Temp()) {
6194                pack = acc;
6195             } else {
6196                pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
6197             }
6198          }
6199
6200          if (pack_const && pack != Temp())
6201             pack = bld.sop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
6202       }
6203       if (pack_const && pack == Temp())
6204          offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
6205       else if (pack == Temp())
6206          has_offset = false;
6207       else
6208          offset = pack;
6209    }
6210
6211    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
6212       prepare_cube_coords(ctx, &coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
6213
6214    /* pack derivatives */
6215    if (has_ddx || has_ddy) {
6216       if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
6217          derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4),
6218                              ddx, Operand(0u), ddy, Operand(0u));
6219       } else {
6220          derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, ddx.size() + ddy.size()), ddx, ddy);
6221       }
6222       has_derivs = true;
6223    }
6224
6225    if (instr->coord_components > 1 &&
6226        instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6227        instr->is_array &&
6228        instr->op != nir_texop_txf)
6229       coords = apply_round_slice(ctx, coords, 1);
6230
6231    if (instr->coord_components > 2 &&
6232       (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
6233        instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6234        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
6235        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6236        instr->is_array &&
6237        instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms)
6238       coords = apply_round_slice(ctx, coords, 2);
6239
6240    if (ctx->options->chip_class == GFX9 &&
6241        instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6242        instr->op != nir_texop_lod && instr->coord_components) {
6243       assert(coords.size() > 0 && coords.size() < 3);
6244
6245       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size() + 1, 1)};
6246       vec->operands[0] = Operand(emit_extract_vector(ctx, coords, 0, v1));
6247       vec->operands[1] = instr->op == nir_texop_txf ? Operand((uint32_t) 0) : Operand((uint32_t) 0x3f000000);
6248       if (coords.size() > 1)
6249          vec->operands[2] = Operand(emit_extract_vector(ctx, coords, 1, v1));
6250       coords = bld.tmp(RegType::vgpr, coords.size() + 1);
6251       vec->definitions[0] = Definition(coords);
6252       ctx->block->instructions.emplace_back(std::move(vec));
6253    }
6254
6255    bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
6256
6257    if (instr->op == nir_texop_samples_identical)
6258       resource = fmask_ptr;
6259
6260    else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6261              instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6262             instr->op != nir_texop_txs) {
6263       assert(has_sample_index);
6264       Operand op(sample_index);
6265       if (sample_index_cv)
6266          op = Operand(sample_index_cv->u32);
6267       sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
6268    }
6269
6270    if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
6271       Temp split_coords[coords.size()];
6272       emit_split_vector(ctx, coords, coords.size());
6273       for (unsigned i = 0; i < coords.size(); i++)
6274          split_coords[i] = emit_extract_vector(ctx, coords, i, v1);
6275
6276       unsigned i = 0;
6277       for (; i < std::min(offset.size(), instr->coord_components); i++) {
6278          Temp off = emit_extract_vector(ctx, offset, i, v1);
6279          split_coords[i] = bld.vadd32(bld.def(v1), split_coords[i], off);
6280       }
6281
6282       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6283       for (unsigned i = 0; i < coords.size(); i++)
6284          vec->operands[i] = Operand(split_coords[i]);
6285       coords = bld.tmp(coords.regClass());
6286       vec->definitions[0] = Definition(coords);
6287       ctx->block->instructions.emplace_back(std::move(vec));
6288
6289       has_offset = false;
6290    }
6291
6292    /* Build tex instruction */
6293    unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
6294    unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
6295                   ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
6296                   : 0;
6297    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6298    Temp tmp_dst = dst;
6299
6300    /* gather4 selects the component by dmask and always returns vec4 */
6301    if (instr->op == nir_texop_tg4) {
6302       assert(instr->dest.ssa.num_components == 4);
6303       if (instr->is_shadow)
6304          dmask = 1;
6305       else
6306          dmask = 1 << instr->component;
6307       if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
6308          tmp_dst = bld.tmp(v4);
6309    } else if (instr->op == nir_texop_samples_identical) {
6310       tmp_dst = bld.tmp(v1);
6311    } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) {
6312       tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
6313    }
6314
6315    aco_ptr<MIMG_instruction> tex;
6316    if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
6317       if (!has_lod)
6318          lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6319
6320       bool div_by_6 = instr->op == nir_texop_txs &&
6321                       instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
6322                       instr->is_array &&
6323                       (dmask & (1 << 2));
6324       if (tmp_dst.id() == dst.id() && div_by_6)
6325          tmp_dst = bld.tmp(tmp_dst.regClass());
6326
6327       tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6328       tex->operands[0] = Operand(as_vgpr(ctx,lod));
6329       tex->operands[1] = Operand(resource);
6330       if (ctx->options->chip_class == GFX9 &&
6331           instr->op == nir_texop_txs &&
6332           instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6333           instr->is_array) {
6334          tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
6335       } else if (instr->op == nir_texop_query_levels) {
6336          tex->dmask = 1 << 3;
6337       } else {
6338          tex->dmask = dmask;
6339       }
6340       tex->da = da;
6341       tex->definitions[0] = Definition(tmp_dst);
6342       tex->dim = dim;
6343       tex->can_reorder = true;
6344       ctx->block->instructions.emplace_back(std::move(tex));
6345
6346       if (div_by_6) {
6347          /* divide 3rd value by 6 by multiplying with magic number */
6348          emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6349          Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
6350          Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c);
6351          assert(instr->dest.ssa.num_components == 3);
6352          Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
6353          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6354                               emit_extract_vector(ctx, tmp_dst, 0, v1),
6355                               emit_extract_vector(ctx, tmp_dst, 1, v1),
6356                               by_6);
6357
6358       }
6359
6360       expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6361       return;
6362    }
6363
6364    Temp tg4_compare_cube_wa64 = Temp();
6365
6366    if (tg4_integer_workarounds) {
6367       tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6368       tex->operands[0] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6369       tex->operands[1] = Operand(resource);
6370       tex->dim = dim;
6371       tex->dmask = 0x3;
6372       tex->da = da;
6373       Temp size = bld.tmp(v2);
6374       tex->definitions[0] = Definition(size);
6375       tex->can_reorder = true;
6376       ctx->block->instructions.emplace_back(std::move(tex));
6377       emit_split_vector(ctx, size, size.size());
6378
6379       Temp half_texel[2];
6380       for (unsigned i = 0; i < 2; i++) {
6381          half_texel[i] = emit_extract_vector(ctx, size, i, v1);
6382          half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
6383          half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
6384          half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
6385       }
6386
6387       Temp orig_coords[2] = {
6388          emit_extract_vector(ctx, coords, 0, v1),
6389          emit_extract_vector(ctx, coords, 1, v1)};
6390       Temp new_coords[2] = {
6391          bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[0], half_texel[0]),
6392          bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[1], half_texel[1])
6393       };
6394
6395       if (tg4_integer_cube_workaround) {
6396          // see comment in ac_nir_to_llvm.c's lower_gather4_integer()
6397          Temp desc[resource.size()];
6398          aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector,
6399                                                                            Format::PSEUDO, 1, resource.size())};
6400          split->operands[0] = Operand(resource);
6401          for (unsigned i = 0; i < resource.size(); i++) {
6402             desc[i] = bld.tmp(s1);
6403             split->definitions[i] = Definition(desc[i]);
6404          }
6405          ctx->block->instructions.emplace_back(std::move(split));
6406
6407          Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16)));
6408          Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
6409                                          Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8));
6410
6411          Temp nfmt;
6412          if (stype == GLSL_TYPE_UINT) {
6413             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6414                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED),
6415                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT),
6416                             bld.scc(compare_cube_wa));
6417          } else {
6418             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6419                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED),
6420                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
6421                             bld.scc(compare_cube_wa));
6422          }
6423          tg4_compare_cube_wa64 = as_divergent_bool(ctx, compare_cube_wa, true);
6424          nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
6425
6426          desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
6427                             Operand((uint32_t)C_008F14_NUM_FORMAT));
6428          desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
6429
6430          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
6431                                                                          Format::PSEUDO, resource.size(), 1)};
6432          for (unsigned i = 0; i < resource.size(); i++)
6433             vec->operands[i] = Operand(desc[i]);
6434          resource = bld.tmp(resource.regClass());
6435          vec->definitions[0] = Definition(resource);
6436          ctx->block->instructions.emplace_back(std::move(vec));
6437
6438          new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6439                                   new_coords[0], orig_coords[0], tg4_compare_cube_wa64);
6440          new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6441                                   new_coords[1], orig_coords[1], tg4_compare_cube_wa64);
6442       }
6443
6444       if (coords.size() == 3) {
6445          coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3),
6446                              new_coords[0], new_coords[1],
6447                              emit_extract_vector(ctx, coords, 2, v1));
6448       } else {
6449          assert(coords.size() == 2);
6450          coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
6451                              new_coords[0], new_coords[1]);
6452       }
6453    }
6454
6455    if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
6456        instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
6457        instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
6458       coords = emit_wqm(ctx, coords, bld.tmp(coords.regClass()), true);
6459
6460    std::vector<Operand> args;
6461    if (has_offset)
6462       args.emplace_back(Operand(offset));
6463    if (has_bias)
6464       args.emplace_back(Operand(bias));
6465    if (has_compare)
6466       args.emplace_back(Operand(compare));
6467    if (has_derivs)
6468       args.emplace_back(Operand(derivs));
6469    args.emplace_back(Operand(coords));
6470    if (has_sample_index)
6471       args.emplace_back(Operand(sample_index));
6472    if (has_lod)
6473       args.emplace_back(lod);
6474
6475    Operand arg;
6476    if (args.size() > 1) {
6477       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
6478       unsigned size = 0;
6479       for (unsigned i = 0; i < args.size(); i++) {
6480          size += args[i].size();
6481          vec->operands[i] = args[i];
6482       }
6483       RegClass rc = RegClass(RegType::vgpr, size);
6484       Temp tmp = bld.tmp(rc);
6485       vec->definitions[0] = Definition(tmp);
6486       ctx->block->instructions.emplace_back(std::move(vec));
6487       arg = Operand(tmp);
6488    } else {
6489       assert(args[0].isTemp());
6490       arg = Operand(as_vgpr(ctx, args[0].getTemp()));
6491    }
6492
6493    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
6494       //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
6495
6496       assert(coords.size() == 1);
6497       unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
6498       aco_opcode op;
6499       switch (last_bit) {
6500       case 1:
6501          op = aco_opcode::buffer_load_format_x; break;
6502       case 2:
6503          op = aco_opcode::buffer_load_format_xy; break;
6504       case 3:
6505          op = aco_opcode::buffer_load_format_xyz; break;
6506       case 4:
6507          op = aco_opcode::buffer_load_format_xyzw; break;
6508       default:
6509          unreachable("Tex instruction loads more than 4 components.");
6510       }
6511
6512       /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */
6513       if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
6514          tmp_dst = dst;
6515       else
6516          tmp_dst = bld.tmp(RegType::vgpr, last_bit);
6517
6518       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
6519       mubuf->operands[0] = Operand(coords);
6520       mubuf->operands[1] = Operand(resource);
6521       mubuf->operands[2] = Operand((uint32_t) 0);
6522       mubuf->definitions[0] = Definition(tmp_dst);
6523       mubuf->idxen = true;
6524       mubuf->can_reorder = true;
6525       ctx->block->instructions.emplace_back(std::move(mubuf));
6526
6527       expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
6528       return;
6529    }
6530
6531
6532    if (instr->op == nir_texop_txf ||
6533        instr->op == nir_texop_txf_ms ||
6534        instr->op == nir_texop_samples_identical) {
6535       aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
6536       tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 2, 1));
6537       tex->operands[0] = Operand(arg);
6538       tex->operands[1] = Operand(resource);
6539       tex->dim = dim;
6540       tex->dmask = dmask;
6541       tex->unrm = true;
6542       tex->da = da;
6543       tex->definitions[0] = Definition(tmp_dst);
6544       tex->can_reorder = true;
6545       ctx->block->instructions.emplace_back(std::move(tex));
6546
6547       if (instr->op == nir_texop_samples_identical) {
6548          assert(dmask == 1 && dst.regClass() == v1);
6549          assert(dst.id() != tmp_dst.id());
6550
6551          Temp tmp = bld.tmp(s2);
6552          bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
6553          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
6554
6555       } else {
6556          expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6557       }
6558       return;
6559    }
6560
6561    // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
6562    aco_opcode opcode = aco_opcode::image_sample;
6563    if (has_offset) { /* image_sample_*_o */
6564       if (has_compare) {
6565          opcode = aco_opcode::image_sample_c_o;
6566          if (has_derivs)
6567             opcode = aco_opcode::image_sample_c_d_o;
6568          if (has_bias)
6569             opcode = aco_opcode::image_sample_c_b_o;
6570          if (level_zero)
6571             opcode = aco_opcode::image_sample_c_lz_o;
6572          if (has_lod)
6573             opcode = aco_opcode::image_sample_c_l_o;
6574       } else {
6575          opcode = aco_opcode::image_sample_o;
6576          if (has_derivs)
6577             opcode = aco_opcode::image_sample_d_o;
6578          if (has_bias)
6579             opcode = aco_opcode::image_sample_b_o;
6580          if (level_zero)
6581             opcode = aco_opcode::image_sample_lz_o;
6582          if (has_lod)
6583             opcode = aco_opcode::image_sample_l_o;
6584       }
6585    } else { /* no offset */
6586       if (has_compare) {
6587          opcode = aco_opcode::image_sample_c;
6588          if (has_derivs)
6589             opcode = aco_opcode::image_sample_c_d;
6590          if (has_bias)
6591             opcode = aco_opcode::image_sample_c_b;
6592          if (level_zero)
6593             opcode = aco_opcode::image_sample_c_lz;
6594          if (has_lod)
6595             opcode = aco_opcode::image_sample_c_l;
6596       } else {
6597          opcode = aco_opcode::image_sample;
6598          if (has_derivs)
6599             opcode = aco_opcode::image_sample_d;
6600          if (has_bias)
6601             opcode = aco_opcode::image_sample_b;
6602          if (level_zero)
6603             opcode = aco_opcode::image_sample_lz;
6604          if (has_lod)
6605             opcode = aco_opcode::image_sample_l;
6606       }
6607    }
6608
6609    if (instr->op == nir_texop_tg4) {
6610       if (has_offset) {
6611          opcode = aco_opcode::image_gather4_lz_o;
6612          if (has_compare)
6613             opcode = aco_opcode::image_gather4_c_lz_o;
6614       } else {
6615          opcode = aco_opcode::image_gather4_lz;
6616          if (has_compare)
6617             opcode = aco_opcode::image_gather4_c_lz;
6618       }
6619    } else if (instr->op == nir_texop_lod) {
6620       opcode = aco_opcode::image_get_lod;
6621    }
6622
6623    tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
6624    tex->operands[0] = arg;
6625    tex->operands[1] = Operand(resource);
6626    tex->operands[2] = Operand(sampler);
6627    tex->dim = dim;
6628    tex->dmask = dmask;
6629    tex->da = da;
6630    tex->definitions[0] = Definition(tmp_dst);
6631    tex->can_reorder = true;
6632    ctx->block->instructions.emplace_back(std::move(tex));
6633
6634    if (tg4_integer_cube_workaround) {
6635       assert(tmp_dst.id() != dst.id());
6636       assert(tmp_dst.size() == dst.size() && dst.size() == 4);
6637
6638       emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6639       Temp val[4];
6640       for (unsigned i = 0; i < dst.size(); i++) {
6641          val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
6642          Temp cvt_val;
6643          if (stype == GLSL_TYPE_UINT)
6644             cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
6645          else
6646             cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
6647          val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64);
6648       }
6649       Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4);
6650       tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6651                            val[0], val[1], val[2], val[3]);
6652    }
6653    unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask;
6654    expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
6655
6656 }
6657
6658
6659 Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa)
6660 {
6661    Temp tmp = get_ssa_temp(ctx, ssa);
6662    if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
6663       return Operand(tmp.regClass());
6664    else
6665       return Operand(tmp);
6666 }
6667
6668 void visit_phi(isel_context *ctx, nir_phi_instr *instr)
6669 {
6670    aco_ptr<Pseudo_instruction> phi;
6671    unsigned num_src = exec_list_length(&instr->srcs);
6672    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6673
6674    aco_opcode opcode = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index] ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
6675
6676    std::map<unsigned, nir_ssa_def*> phi_src;
6677    bool all_undef = true;
6678    nir_foreach_phi_src(src, instr) {
6679       phi_src[src->pred->index] = src->src.ssa;
6680       if (src->src.ssa->parent_instr->type != nir_instr_type_ssa_undef)
6681          all_undef = false;
6682    }
6683    if (all_undef) {
6684       Builder bld(ctx->program, ctx->block);
6685       if (dst.regClass() == s1) {
6686          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
6687       } else if (dst.regClass() == v1) {
6688          bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
6689       } else {
6690          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6691          for (unsigned i = 0; i < dst.size(); i++)
6692             vec->operands[i] = Operand(0u);
6693          vec->definitions[0] = Definition(dst);
6694          ctx->block->instructions.emplace_back(std::move(vec));
6695       }
6696       return;
6697    }
6698
6699    /* try to scalarize vector phis */
6700    if (dst.size() > 1) {
6701       // TODO: scalarize linear phis on divergent ifs
6702       bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
6703       std::array<Temp, 4> new_vec;
6704       for (std::pair<const unsigned, nir_ssa_def*>& pair : phi_src) {
6705          Operand src = get_phi_operand(ctx, pair.second);
6706          if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end()) {
6707             can_scalarize = false;
6708             break;
6709          }
6710       }
6711       if (can_scalarize) {
6712          unsigned num_components = instr->dest.ssa.num_components;
6713          assert(dst.size() % num_components == 0);
6714          RegClass rc = RegClass(dst.type(), dst.size() / num_components);
6715
6716          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
6717          for (unsigned k = 0; k < num_components; k++) {
6718             phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src, 1));
6719             std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin();
6720             for (unsigned i = 0; i < num_src; i++) {
6721                Operand src = get_phi_operand(ctx, it->second);
6722                phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc);
6723                ++it;
6724             }
6725             Temp phi_dst = {ctx->program->allocateId(), rc};
6726             phi->definitions[0] = Definition(phi_dst);
6727             ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
6728             new_vec[k] = phi_dst;
6729             vec->operands[k] = Operand(phi_dst);
6730          }
6731          vec->definitions[0] = Definition(dst);
6732          ctx->block->instructions.emplace_back(std::move(vec));
6733          ctx->allocated_vec.emplace(dst.id(), new_vec);
6734          return;
6735       }
6736    }
6737
6738    unsigned extra_src = 0;
6739    if (opcode == aco_opcode::p_linear_phi && (ctx->block->kind & block_kind_loop_exit) &&
6740        ctx->program->blocks[ctx->block->index-2].kind & block_kind_continue_or_break) {
6741       extra_src++;
6742    }
6743
6744    phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src + extra_src, 1));
6745
6746    /* if we have a linear phi on a divergent if, we know that one src is undef */
6747    if (opcode == aco_opcode::p_linear_phi && ctx->block->kind & block_kind_merge) {
6748       assert(extra_src == 0);
6749       Block* block;
6750       /* we place the phi either in the invert-block or in the current block */
6751       if (phi_src.begin()->second->parent_instr->type != nir_instr_type_ssa_undef) {
6752          assert((++phi_src.begin())->second->parent_instr->type == nir_instr_type_ssa_undef);
6753          Block& linear_else = ctx->program->blocks[ctx->block->linear_preds[1]];
6754          block = &ctx->program->blocks[linear_else.linear_preds[0]];
6755          assert(block->kind & block_kind_invert);
6756          phi->operands[0] = get_phi_operand(ctx, phi_src.begin()->second);
6757       } else {
6758          assert((++phi_src.begin())->second->parent_instr->type != nir_instr_type_ssa_undef);
6759          block = ctx->block;
6760          phi->operands[0] = get_phi_operand(ctx, (++phi_src.begin())->second);
6761       }
6762       phi->operands[1] = Operand(dst.regClass());
6763       phi->definitions[0] = Definition(dst);
6764       block->instructions.emplace(block->instructions.begin(), std::move(phi));
6765       return;
6766    }
6767
6768    std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin();
6769    for (unsigned i = 0; i < num_src; i++) {
6770       phi->operands[i] = get_phi_operand(ctx, it->second);
6771       ++it;
6772    }
6773    for (unsigned i = 0; i < extra_src; i++)
6774       phi->operands[num_src + i] = Operand(dst.regClass());
6775    phi->definitions[0] = Definition(dst);
6776    ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
6777 }
6778
6779
6780 void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr)
6781 {
6782    Temp dst = get_ssa_temp(ctx, &instr->def);
6783
6784    assert(dst.type() == RegType::sgpr);
6785
6786    if (dst.size() == 1) {
6787       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u));
6788    } else {
6789       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6790       for (unsigned i = 0; i < dst.size(); i++)
6791          vec->operands[i] = Operand(0u);
6792       vec->definitions[0] = Definition(dst);
6793       ctx->block->instructions.emplace_back(std::move(vec));
6794    }
6795 }
6796
6797 void visit_jump(isel_context *ctx, nir_jump_instr *instr)
6798 {
6799    Builder bld(ctx->program, ctx->block);
6800    Block *logical_target;
6801    append_logical_end(ctx->block);
6802    unsigned idx = ctx->block->index;
6803
6804    switch (instr->type) {
6805    case nir_jump_break:
6806       logical_target = ctx->cf_info.parent_loop.exit;
6807       add_logical_edge(idx, logical_target);
6808       ctx->block->kind |= block_kind_break;
6809
6810       if (!ctx->cf_info.parent_if.is_divergent &&
6811           !ctx->cf_info.parent_loop.has_divergent_continue) {
6812          /* uniform break - directly jump out of the loop */
6813          ctx->block->kind |= block_kind_uniform;
6814          ctx->cf_info.has_branch = true;
6815          bld.branch(aco_opcode::p_branch);
6816          add_linear_edge(idx, logical_target);
6817          return;
6818       }
6819       ctx->cf_info.parent_loop.has_divergent_branch = true;
6820       break;
6821    case nir_jump_continue:
6822       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
6823       add_logical_edge(idx, logical_target);
6824       ctx->block->kind |= block_kind_continue;
6825
6826       if (ctx->cf_info.parent_if.is_divergent) {
6827          /* for potential uniform breaks after this continue,
6828             we must ensure that they are handled correctly */
6829          ctx->cf_info.parent_loop.has_divergent_continue = true;
6830          ctx->cf_info.parent_loop.has_divergent_branch = true;
6831       } else {
6832          /* uniform continue - directly jump to the loop header */
6833          ctx->block->kind |= block_kind_uniform;
6834          ctx->cf_info.has_branch = true;
6835          bld.branch(aco_opcode::p_branch);
6836          add_linear_edge(idx, logical_target);
6837          return;
6838       }
6839       break;
6840    default:
6841       fprintf(stderr, "Unknown NIR jump instr: ");
6842       nir_print_instr(&instr->instr, stderr);
6843       fprintf(stderr, "\n");
6844       abort();
6845    }
6846
6847    /* remove critical edges from linear CFG */
6848    bld.branch(aco_opcode::p_branch);
6849    Block* break_block = ctx->program->create_and_insert_block();
6850    break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
6851    break_block->kind |= block_kind_uniform;
6852    add_linear_edge(idx, break_block);
6853    /* the loop_header pointer might be invalidated by this point */
6854    if (instr->type == nir_jump_continue)
6855       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
6856    add_linear_edge(break_block->index, logical_target);
6857    bld.reset(break_block);
6858    bld.branch(aco_opcode::p_branch);
6859
6860    Block* continue_block = ctx->program->create_and_insert_block();
6861    continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
6862    add_linear_edge(idx, continue_block);
6863    append_logical_start(continue_block);
6864    ctx->block = continue_block;
6865    return;
6866 }
6867
6868 void visit_block(isel_context *ctx, nir_block *block)
6869 {
6870    nir_foreach_instr(instr, block) {
6871       switch (instr->type) {
6872       case nir_instr_type_alu:
6873          visit_alu_instr(ctx, nir_instr_as_alu(instr));
6874          break;
6875       case nir_instr_type_load_const:
6876          visit_load_const(ctx, nir_instr_as_load_const(instr));
6877          break;
6878       case nir_instr_type_intrinsic:
6879          visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
6880          break;
6881       case nir_instr_type_tex:
6882          visit_tex(ctx, nir_instr_as_tex(instr));
6883          break;
6884       case nir_instr_type_phi:
6885          visit_phi(ctx, nir_instr_as_phi(instr));
6886          break;
6887       case nir_instr_type_ssa_undef:
6888          visit_undef(ctx, nir_instr_as_ssa_undef(instr));
6889          break;
6890       case nir_instr_type_deref:
6891          break;
6892       case nir_instr_type_jump:
6893          visit_jump(ctx, nir_instr_as_jump(instr));
6894          break;
6895       default:
6896          fprintf(stderr, "Unknown NIR instr type: ");
6897          nir_print_instr(instr, stderr);
6898          fprintf(stderr, "\n");
6899          //abort();
6900       }
6901    }
6902 }
6903
6904
6905
6906 static void visit_loop(isel_context *ctx, nir_loop *loop)
6907 {
6908    append_logical_end(ctx->block);
6909    ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
6910    Builder bld(ctx->program, ctx->block);
6911    bld.branch(aco_opcode::p_branch);
6912    unsigned loop_preheader_idx = ctx->block->index;
6913
6914    Block loop_exit = Block();
6915    loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth;
6916    loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
6917
6918    Block* loop_header = ctx->program->create_and_insert_block();
6919    loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
6920    loop_header->kind |= block_kind_loop_header;
6921    add_edge(loop_preheader_idx, loop_header);
6922    ctx->block = loop_header;
6923
6924    /* emit loop body */
6925    unsigned loop_header_idx = loop_header->index;
6926    loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
6927    append_logical_start(ctx->block);
6928    visit_cf_list(ctx, &loop->body);
6929
6930    //TODO: what if a loop ends with a unconditional or uniformly branched continue and this branch is never taken?
6931    if (!ctx->cf_info.has_branch) {
6932       append_logical_end(ctx->block);
6933       if (ctx->cf_info.exec_potentially_empty) {
6934          /* Discards can result in code running with an empty exec mask.
6935           * This would result in divergent breaks not ever being taken. As a
6936           * workaround, break the loop when the loop mask is empty instead of
6937           * always continuing. */
6938          ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
6939
6940          /* create "loop_almost_exit" to avoid critical edges */
6941          unsigned block_idx = ctx->block->index;
6942          Block *loop_almost_exit = ctx->program->create_and_insert_block();
6943          loop_almost_exit->loop_nest_depth = ctx->cf_info.loop_nest_depth;
6944          loop_almost_exit->kind = block_kind_uniform;
6945          bld.reset(loop_almost_exit);
6946          bld.branch(aco_opcode::p_branch);
6947
6948          add_linear_edge(block_idx, loop_almost_exit);
6949          add_linear_edge(loop_almost_exit->index, &loop_exit);
6950
6951          ctx->block = &ctx->program->blocks[block_idx];
6952       } else {
6953          ctx->block->kind |= (block_kind_continue | block_kind_uniform);
6954       }
6955       if (!ctx->cf_info.parent_loop.has_divergent_branch)
6956          add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
6957       else
6958          add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
6959       bld.reset(ctx->block);
6960       bld.branch(aco_opcode::p_branch);
6961    }
6962
6963    /* fixup phis in loop header from unreachable blocks */
6964    if (ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch) {
6965       bool linear = ctx->cf_info.has_branch;
6966       bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
6967       for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
6968          if ((logical && instr->opcode == aco_opcode::p_phi) ||
6969              (linear && instr->opcode == aco_opcode::p_linear_phi)) {
6970             /* the last operand should be the one that needs to be removed */
6971             instr->operands.pop_back();
6972          } else if (!is_phi(instr)) {
6973             break;
6974          }
6975       }
6976    }
6977
6978    ctx->cf_info.has_branch = false;
6979
6980    // TODO: if the loop has not a single exit, we must add one °°
6981    /* emit loop successor block */
6982    ctx->block = ctx->program->insert_block(std::move(loop_exit));
6983    append_logical_start(ctx->block);
6984
6985    #if 0
6986    // TODO: check if it is beneficial to not branch on continues
6987    /* trim linear phis in loop header */
6988    for (auto&& instr : loop_entry->instructions) {
6989       if (instr->opcode == aco_opcode::p_linear_phi) {
6990          aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
6991          new_phi->definitions[0] = instr->definitions[0];
6992          for (unsigned i = 0; i < new_phi->operands.size(); i++)
6993             new_phi->operands[i] = instr->operands[i];
6994          /* check that the remaining operands are all the same */
6995          for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
6996             assert(instr->operands[i].tempId() == instr->operands.back().tempId());
6997          instr.swap(new_phi);
6998       } else if (instr->opcode == aco_opcode::p_phi) {
6999          continue;
7000       } else {
7001          break;
7002       }
7003    }
7004    #endif
7005 }
7006
7007 static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond)
7008 {
7009    ic->cond = cond;
7010
7011    append_logical_end(ctx->block);
7012    ctx->block->kind |= block_kind_branch;
7013
7014    /* branch to linear then block */
7015    assert(cond.regClass() == s2);
7016    aco_ptr<Pseudo_branch_instruction> branch;
7017    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7018    branch->operands[0] = Operand(cond);
7019    ctx->block->instructions.push_back(std::move(branch));
7020
7021    ic->BB_if_idx = ctx->block->index;
7022    ic->BB_invert = Block();
7023    ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7024    /* Invert blocks are intentionally not marked as top level because they
7025     * are not part of the logical cfg. */
7026    ic->BB_invert.kind |= block_kind_invert;
7027    ic->BB_endif = Block();
7028    ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7029    ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
7030
7031    ic->exec_potentially_empty_old = ctx->cf_info.exec_potentially_empty;
7032    ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
7033    ctx->cf_info.parent_if.is_divergent = true;
7034    ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7035
7036    /** emit logical then block */
7037    Block* BB_then_logical = ctx->program->create_and_insert_block();
7038    BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7039    add_edge(ic->BB_if_idx, BB_then_logical);
7040    ctx->block = BB_then_logical;
7041    append_logical_start(BB_then_logical);
7042 }
7043
7044 static void begin_divergent_if_else(isel_context *ctx, if_context *ic)
7045 {
7046    Block *BB_then_logical = ctx->block;
7047    append_logical_end(BB_then_logical);
7048     /* branch from logical then block to invert block */
7049    aco_ptr<Pseudo_branch_instruction> branch;
7050    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7051    BB_then_logical->instructions.emplace_back(std::move(branch));
7052    add_linear_edge(BB_then_logical->index, &ic->BB_invert);
7053    if (!ctx->cf_info.parent_loop.has_divergent_branch)
7054       add_logical_edge(BB_then_logical->index, &ic->BB_endif);
7055    BB_then_logical->kind |= block_kind_uniform;
7056    assert(!ctx->cf_info.has_branch);
7057    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7058    ctx->cf_info.parent_loop.has_divergent_branch = false;
7059
7060    /** emit linear then block */
7061    Block* BB_then_linear = ctx->program->create_and_insert_block();
7062    BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7063    BB_then_linear->kind |= block_kind_uniform;
7064    add_linear_edge(ic->BB_if_idx, BB_then_linear);
7065    /* branch from linear then block to invert block */
7066    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7067    BB_then_linear->instructions.emplace_back(std::move(branch));
7068    add_linear_edge(BB_then_linear->index, &ic->BB_invert);
7069
7070    /** emit invert merge block */
7071    ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
7072    ic->invert_idx = ctx->block->index;
7073
7074    /* branch to linear else block (skip else) */
7075    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0));
7076    branch->operands[0] = Operand(ic->cond);
7077    ctx->block->instructions.push_back(std::move(branch));
7078
7079    ic->exec_potentially_empty_old |= ctx->cf_info.exec_potentially_empty;
7080    ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7081
7082    /** emit logical else block */
7083    Block* BB_else_logical = ctx->program->create_and_insert_block();
7084    BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7085    add_logical_edge(ic->BB_if_idx, BB_else_logical);
7086    add_linear_edge(ic->invert_idx, BB_else_logical);
7087    ctx->block = BB_else_logical;
7088    append_logical_start(BB_else_logical);
7089 }
7090
7091 static void end_divergent_if(isel_context *ctx, if_context *ic)
7092 {
7093    Block *BB_else_logical = ctx->block;
7094    append_logical_end(BB_else_logical);
7095
7096    /* branch from logical else block to endif block */
7097    aco_ptr<Pseudo_branch_instruction> branch;
7098    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7099    BB_else_logical->instructions.emplace_back(std::move(branch));
7100    add_linear_edge(BB_else_logical->index, &ic->BB_endif);
7101    if (!ctx->cf_info.parent_loop.has_divergent_branch)
7102       add_logical_edge(BB_else_logical->index, &ic->BB_endif);
7103    BB_else_logical->kind |= block_kind_uniform;
7104
7105    assert(!ctx->cf_info.has_branch);
7106    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
7107
7108
7109    /** emit linear else block */
7110    Block* BB_else_linear = ctx->program->create_and_insert_block();
7111    BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7112    BB_else_linear->kind |= block_kind_uniform;
7113    add_linear_edge(ic->invert_idx, BB_else_linear);
7114
7115    /* branch from linear else block to endif block */
7116    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7117    BB_else_linear->instructions.emplace_back(std::move(branch));
7118    add_linear_edge(BB_else_linear->index, &ic->BB_endif);
7119
7120
7121    /** emit endif merge block */
7122    ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
7123    append_logical_start(ctx->block);
7124
7125
7126    ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
7127    ctx->cf_info.exec_potentially_empty |= ic->exec_potentially_empty_old;
7128    /* uniform control flow never has an empty exec-mask */
7129    if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
7130       ctx->cf_info.exec_potentially_empty = false;
7131 }
7132
7133 static void visit_if(isel_context *ctx, nir_if *if_stmt)
7134 {
7135    Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
7136    Builder bld(ctx->program, ctx->block);
7137    aco_ptr<Pseudo_branch_instruction> branch;
7138
7139    if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
7140       /**
7141        * Uniform conditionals are represented in the following way*) :
7142        *
7143        * The linear and logical CFG:
7144        *                        BB_IF
7145        *                        /    \
7146        *       BB_THEN (logical)      BB_ELSE (logical)
7147        *                        \    /
7148        *                        BB_ENDIF
7149        *
7150        * *) Exceptions may be due to break and continue statements within loops
7151        *    If a break/continue happens within uniform control flow, it branches
7152        *    to the loop exit/entry block. Otherwise, it branches to the next
7153        *    merge block.
7154        **/
7155       append_logical_end(ctx->block);
7156       ctx->block->kind |= block_kind_uniform;
7157
7158       /* emit branch */
7159       if (cond.regClass() == s2) {
7160          // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
7161          cond = as_uniform_bool(ctx, cond);
7162       }
7163       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7164       branch->operands[0] = Operand(cond);
7165       branch->operands[0].setFixed(scc);
7166       ctx->block->instructions.emplace_back(std::move(branch));
7167
7168       unsigned BB_if_idx = ctx->block->index;
7169       Block BB_endif = Block();
7170       BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7171       BB_endif.kind |= ctx->block->kind & block_kind_top_level;
7172
7173       /** emit then block */
7174       Block* BB_then = ctx->program->create_and_insert_block();
7175       BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7176       add_edge(BB_if_idx, BB_then);
7177       append_logical_start(BB_then);
7178       ctx->block = BB_then;
7179       visit_cf_list(ctx, &if_stmt->then_list);
7180       BB_then = ctx->block;
7181       bool then_branch = ctx->cf_info.has_branch;
7182       bool then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7183
7184       if (!then_branch) {
7185          append_logical_end(BB_then);
7186          /* branch from then block to endif block */
7187          branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7188          BB_then->instructions.emplace_back(std::move(branch));
7189          add_linear_edge(BB_then->index, &BB_endif);
7190          if (!then_branch_divergent)
7191             add_logical_edge(BB_then->index, &BB_endif);
7192          BB_then->kind |= block_kind_uniform;
7193       }
7194
7195       ctx->cf_info.has_branch = false;
7196       ctx->cf_info.parent_loop.has_divergent_branch = false;
7197
7198       /** emit else block */
7199       Block* BB_else = ctx->program->create_and_insert_block();
7200       BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7201       add_edge(BB_if_idx, BB_else);
7202       append_logical_start(BB_else);
7203       ctx->block = BB_else;
7204       visit_cf_list(ctx, &if_stmt->else_list);
7205       BB_else = ctx->block;
7206
7207       if (!ctx->cf_info.has_branch) {
7208          append_logical_end(BB_else);
7209          /* branch from then block to endif block */
7210          branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7211          BB_else->instructions.emplace_back(std::move(branch));
7212          add_linear_edge(BB_else->index, &BB_endif);
7213          if (!ctx->cf_info.parent_loop.has_divergent_branch)
7214             add_logical_edge(BB_else->index, &BB_endif);
7215          BB_else->kind |= block_kind_uniform;
7216       }
7217
7218       ctx->cf_info.has_branch &= then_branch;
7219       ctx->cf_info.parent_loop.has_divergent_branch &= then_branch_divergent;
7220
7221       /** emit endif merge block */
7222       if (!ctx->cf_info.has_branch) {
7223          ctx->block = ctx->program->insert_block(std::move(BB_endif));
7224          append_logical_start(ctx->block);
7225       }
7226    } else { /* non-uniform condition */
7227       /**
7228        * To maintain a logical and linear CFG without critical edges,
7229        * non-uniform conditionals are represented in the following way*) :
7230        *
7231        * The linear CFG:
7232        *                        BB_IF
7233        *                        /    \
7234        *       BB_THEN (logical)      BB_THEN (linear)
7235        *                        \    /
7236        *                        BB_INVERT (linear)
7237        *                        /    \
7238        *       BB_ELSE (logical)      BB_ELSE (linear)
7239        *                        \    /
7240        *                        BB_ENDIF
7241        *
7242        * The logical CFG:
7243        *                        BB_IF
7244        *                        /    \
7245        *       BB_THEN (logical)      BB_ELSE (logical)
7246        *                        \    /
7247        *                        BB_ENDIF
7248        *
7249        * *) Exceptions may be due to break and continue statements within loops
7250        **/
7251
7252       if_context ic;
7253
7254       begin_divergent_if_then(ctx, &ic, cond);
7255       visit_cf_list(ctx, &if_stmt->then_list);
7256
7257       begin_divergent_if_else(ctx, &ic);
7258       visit_cf_list(ctx, &if_stmt->else_list);
7259
7260       end_divergent_if(ctx, &ic);
7261    }
7262 }
7263
7264 static void visit_cf_list(isel_context *ctx,
7265                           struct exec_list *list)
7266 {
7267    foreach_list_typed(nir_cf_node, node, node, list) {
7268       switch (node->type) {
7269       case nir_cf_node_block:
7270          visit_block(ctx, nir_cf_node_as_block(node));
7271          break;
7272       case nir_cf_node_if:
7273          visit_if(ctx, nir_cf_node_as_if(node));
7274          break;
7275       case nir_cf_node_loop:
7276          visit_loop(ctx, nir_cf_node_as_loop(node));
7277          break;
7278       default:
7279          unreachable("unimplemented cf list type");
7280       }
7281    }
7282 }
7283
7284 static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
7285 {
7286    int offset = ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
7287    uint64_t mask = ctx->vs_output.mask[slot];
7288    if (!is_pos && !mask)
7289       return;
7290    if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
7291       return;
7292    aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7293    exp->enabled_mask = mask;
7294    for (unsigned i = 0; i < 4; ++i) {
7295       if (mask & (1 << i))
7296          exp->operands[i] = Operand(ctx->vs_output.outputs[slot][i]);
7297       else
7298          exp->operands[i] = Operand(v1);
7299    }
7300    exp->valid_mask = false;
7301    exp->done = false;
7302    exp->compressed = false;
7303    if (is_pos)
7304       exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7305    else
7306       exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
7307    ctx->block->instructions.emplace_back(std::move(exp));
7308 }
7309
7310 static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
7311 {
7312    aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7313    exp->enabled_mask = 0;
7314    for (unsigned i = 0; i < 4; ++i)
7315       exp->operands[i] = Operand(v1);
7316    if (ctx->vs_output.mask[VARYING_SLOT_PSIZ]) {
7317       exp->operands[0] = Operand(ctx->vs_output.outputs[VARYING_SLOT_PSIZ][0]);
7318       exp->enabled_mask |= 0x1;
7319    }
7320    if (ctx->vs_output.mask[VARYING_SLOT_LAYER]) {
7321       exp->operands[2] = Operand(ctx->vs_output.outputs[VARYING_SLOT_LAYER][0]);
7322       exp->enabled_mask |= 0x4;
7323    }
7324    if (ctx->vs_output.mask[VARYING_SLOT_VIEWPORT]) {
7325       if (ctx->options->chip_class < GFX9) {
7326          exp->operands[3] = Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]);
7327          exp->enabled_mask |= 0x8;
7328       } else {
7329          Builder bld(ctx->program, ctx->block);
7330
7331          Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
7332                              Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]));
7333          if (exp->operands[2].isTemp())
7334             out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
7335
7336          exp->operands[2] = Operand(out);
7337          exp->enabled_mask |= 0x4;
7338       }
7339    }
7340    exp->valid_mask = false;
7341    exp->done = false;
7342    exp->compressed = false;
7343    exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7344    ctx->block->instructions.emplace_back(std::move(exp));
7345 }
7346
7347 static void create_vs_exports(isel_context *ctx)
7348 {
7349    radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
7350
7351    if (outinfo->export_prim_id) {
7352       ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
7353       ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = ctx->vs_prim_id;
7354    }
7355
7356    if (ctx->options->key.has_multiview_view_index) {
7357       ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1;
7358       ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, ctx->view_index);
7359    }
7360
7361    /* the order these position exports are created is important */
7362    int next_pos = 0;
7363    export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
7364    if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
7365       export_vs_psiz_layer_viewport(ctx, &next_pos);
7366    }
7367    if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7368       export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
7369    if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7370       export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
7371
7372    if (ctx->options->key.vs_common_out.export_clip_dists) {
7373       if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7374          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
7375       if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7376          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
7377    }
7378
7379    for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
7380       if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER &&
7381           i != VARYING_SLOT_PRIMITIVE_ID)
7382          continue;
7383
7384       export_vs_varying(ctx, i, false, NULL);
7385    }
7386 }
7387
7388 static void emit_stream_output(isel_context *ctx,
7389                                Temp const *so_buffers,
7390                                Temp const *so_write_offset,
7391                                const struct radv_stream_output *output)
7392 {
7393    unsigned num_comps = util_bitcount(output->component_mask);
7394    unsigned loc = output->location;
7395    unsigned buf = output->buffer;
7396    unsigned offset = output->offset;
7397
7398    assert(num_comps && num_comps <= 4);
7399    if (!num_comps || num_comps > 4)
7400       return;
7401
7402    unsigned start = ffs(output->component_mask) - 1;
7403
7404    Temp out[4];
7405    bool all_undef = true;
7406    assert(ctx->stage == vertex_vs);
7407    for (unsigned i = 0; i < num_comps; i++) {
7408       out[i] = ctx->vs_output.outputs[loc][start + i];
7409       all_undef = all_undef && !out[i].id();
7410    }
7411    if (all_undef)
7412       return;
7413
7414    Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_comps)};
7415    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_comps, 1)};
7416    for (unsigned i = 0; i < num_comps; ++i)
7417       vec->operands[i] = (ctx->vs_output.mask[loc] & 1 << i) ? Operand(out[i]) : Operand(0u);
7418    vec->definitions[0] = Definition(write_data);
7419    ctx->block->instructions.emplace_back(std::move(vec));
7420
7421    aco_opcode opcode;
7422    switch (num_comps) {
7423    case 1:
7424       opcode = aco_opcode::buffer_store_dword;
7425       break;
7426    case 2:
7427       opcode = aco_opcode::buffer_store_dwordx2;
7428       break;
7429    case 3:
7430       opcode = aco_opcode::buffer_store_dwordx3;
7431       break;
7432    case 4:
7433       opcode = aco_opcode::buffer_store_dwordx4;
7434       break;
7435    }
7436
7437    aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
7438    store->operands[0] = Operand(so_write_offset[buf]);
7439    store->operands[1] = Operand(so_buffers[buf]);
7440    store->operands[2] = Operand((uint32_t) 0);
7441    store->operands[3] = Operand(write_data);
7442    if (offset > 4095) {
7443       /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */
7444       Builder bld(ctx->program, ctx->block);
7445       store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
7446    } else {
7447       store->offset = offset;
7448    }
7449    store->offen = true;
7450    store->glc = true;
7451    store->dlc = false;
7452    store->slc = true;
7453    store->can_reorder = true;
7454    ctx->block->instructions.emplace_back(std::move(store));
7455 }
7456
7457 static void emit_streamout(isel_context *ctx, unsigned stream)
7458 {
7459    Builder bld(ctx->program, ctx->block);
7460
7461    Temp so_buffers[4];
7462    Temp buf_ptr = convert_pointer_to_64_bit(ctx, ctx->streamout_buffers);
7463    for (unsigned i = 0; i < 4; i++) {
7464       unsigned stride = ctx->program->info->so.strides[i];
7465       if (!stride)
7466          continue;
7467
7468       so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, Operand(i * 16u));
7469    }
7470
7471    Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
7472                                 ctx->streamout_config, Operand(0x70010u));
7473
7474    Temp tid = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
7475                        bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
7476
7477    Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(s2), so_vtx_count, tid);
7478
7479    if_context ic;
7480    begin_divergent_if_then(ctx, &ic, can_emit);
7481
7482    bld.reset(ctx->block);
7483
7484    Temp so_write_index = bld.vadd32(bld.def(v1), ctx->streamout_write_idx, tid);
7485
7486    Temp so_write_offset[4];
7487
7488    for (unsigned i = 0; i < 4; i++) {
7489       unsigned stride = ctx->program->info->so.strides[i];
7490       if (!stride)
7491          continue;
7492
7493       if (stride == 1) {
7494          Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
7495                                 ctx->streamout_write_idx, ctx->streamout_offset[i]);
7496          Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
7497
7498          so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
7499       } else {
7500          Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
7501          Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u), ctx->streamout_offset[i]);
7502          so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
7503       }
7504    }
7505
7506    for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
7507       struct radv_stream_output *output =
7508          &ctx->program->info->so.outputs[i];
7509       if (stream != output->stream)
7510          continue;
7511
7512       emit_stream_output(ctx, so_buffers, so_write_offset, output);
7513    }
7514
7515    begin_divergent_if_else(ctx, &ic);
7516    end_divergent_if(ctx, &ic);
7517 }
7518
7519 } /* end namespace */
7520
7521 void handle_bc_optimize(isel_context *ctx)
7522 {
7523    /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
7524    Builder bld(ctx->program, ctx->block);
7525    uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
7526    bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
7527    bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
7528    if (uses_center && uses_centroid) {
7529       Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)), ctx->prim_mask, Operand(0u));
7530
7531       if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
7532          for (unsigned i = 0; i < 2; i++) {
7533             Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7534                                       ctx->fs_inputs[fs_input::persp_centroid_p1 + i],
7535                                       ctx->fs_inputs[fs_input::persp_center_p1 + i],
7536                                       sel);
7537             ctx->fs_inputs[fs_input::persp_centroid_p1 + i] = new_coord;
7538          }
7539       }
7540
7541       if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
7542          for (unsigned i = 0; i < 2; i++) {
7543             Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7544                                       ctx->fs_inputs[fs_input::linear_centroid_p1 + i],
7545                                       ctx->fs_inputs[fs_input::linear_center_p1 + i],
7546                                       sel);
7547             ctx->fs_inputs[fs_input::linear_centroid_p1 + i] = new_coord;
7548          }
7549       }
7550    }
7551 }
7552
7553 void select_program(Program *program,
7554                     unsigned shader_count,
7555                     struct nir_shader *const *shaders,
7556                     ac_shader_config* config,
7557                     struct radv_shader_info *info,
7558                     struct radv_nir_compiler_options *options)
7559 {
7560    isel_context ctx = setup_isel_context(program, shader_count, shaders, config, info, options);
7561
7562    for (unsigned i = 0; i < shader_count; i++) {
7563       nir_shader *nir = shaders[i];
7564       init_context(&ctx, nir);
7565
7566       if (!i) {
7567          add_startpgm(&ctx); /* needs to be after init_context() for FS */
7568          append_logical_start(ctx.block);
7569       }
7570
7571       if_context ic;
7572       if (shader_count >= 2) {
7573          Builder bld(ctx.program, ctx.block);
7574          Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u)));
7575          Temp thread_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
7576                                    bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
7577          Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(s2)), count, thread_id);
7578
7579          begin_divergent_if_then(&ctx, &ic, cond);
7580       }
7581
7582       if (i) {
7583          Builder bld(ctx.program, ctx.block);
7584          bld.barrier(aco_opcode::p_memory_barrier_shared); //TODO: different barriers are needed for different stages
7585          bld.sopp(aco_opcode::s_barrier);
7586       }
7587
7588       if (ctx.stage == fragment_fs)
7589          handle_bc_optimize(&ctx);
7590
7591       nir_function_impl *func = nir_shader_get_entrypoint(nir);
7592       visit_cf_list(&ctx, &func->body);
7593
7594       if (ctx.program->info->so.num_outputs/*&& !ctx->is_gs_copy_shader */)
7595          emit_streamout(&ctx, 0);
7596
7597       if (ctx.stage == vertex_vs)
7598          create_vs_exports(&ctx);
7599
7600       if (shader_count >= 2) {
7601          begin_divergent_if_else(&ctx, &ic);
7602          end_divergent_if(&ctx, &ic);
7603       }
7604
7605       ralloc_free(ctx.divergent_vals);
7606    }
7607
7608    append_logical_end(ctx.block);
7609    ctx.block->kind |= block_kind_uniform;
7610    Builder bld(ctx.program, ctx.block);
7611    if (ctx.program->wb_smem_l1_on_end)
7612       bld.smem(aco_opcode::s_dcache_wb, false);
7613    bld.sopp(aco_opcode::s_endpgm);
7614
7615    /* cleanup CFG */
7616    for (Block& BB : program->blocks) {
7617       for (unsigned idx : BB.linear_preds)
7618          program->blocks[idx].linear_succs.emplace_back(BB.index);
7619       for (unsigned idx : BB.logical_preds)
7620          program->blocks[idx].logical_succs.emplace_back(BB.index);
7621    }
7622 }
7623 }