src/amd/compiler/aco_instruction_selection.cpp

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  * Copyright © 2018 Google
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  *
  24  */
  25
  26 #include <algorithm>
  27 #include <array>
  28 #include <map>
  29
  30 #include "ac_shader_util.h"
  31 #include "aco_ir.h"
  32 #include "aco_builder.h"
  33 #include "aco_interface.h"
  34 #include "aco_instruction_selection_setup.cpp"
  35 #include "util/fast_idiv_by_const.h"
  36
  37 namespace aco {
  38 namespace {
  39
  40 class loop_info_RAII {
  41    isel_context* ctx;
  42    unsigned header_idx_old;
  43    Block* exit_old;
  44    bool divergent_cont_old;
  45    bool divergent_branch_old;
  46    bool divergent_if_old;
  47
  48 public:
  49    loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
  50       : ctx(ctx),
  51         header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
  52         divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
  53         divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
  54         divergent_if_old(ctx->cf_info.parent_if.is_divergent)
  55    {
  56       ctx->cf_info.parent_loop.header_idx = loop_header_idx;
  57       ctx->cf_info.parent_loop.exit = loop_exit;
  58       ctx->cf_info.parent_loop.has_divergent_continue = false;
  59       ctx->cf_info.parent_loop.has_divergent_branch = false;
  60       ctx->cf_info.parent_if.is_divergent = false;
  61       ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
  62    }
  63
  64    ~loop_info_RAII()
  65    {
  66       ctx->cf_info.parent_loop.header_idx = header_idx_old;
  67       ctx->cf_info.parent_loop.exit = exit_old;
  68       ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
  69       ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
  70       ctx->cf_info.parent_if.is_divergent = divergent_if_old;
  71       ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
  72       if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
  73          ctx->cf_info.exec_potentially_empty = false;
  74    }
  75 };
  76
  77 struct if_context {
  78    Temp cond;
  79
  80    bool divergent_old;
  81    bool exec_potentially_empty_old;
  82
  83    unsigned BB_if_idx;
  84    unsigned invert_idx;
  85    bool then_branch_divergent;
  86    Block BB_invert;
  87    Block BB_endif;
  88 };
  89
  90 static void visit_cf_list(struct isel_context *ctx,
  91                           struct exec_list *list);
  92
  93 static void add_logical_edge(unsigned pred_idx, Block *succ)
  94 {
  95    succ->logical_preds.emplace_back(pred_idx);
  96 }
  97
  98
  99 static void add_linear_edge(unsigned pred_idx, Block *succ)
 100 {
 101    succ->linear_preds.emplace_back(pred_idx);
 102 }
 103
 104 static void add_edge(unsigned pred_idx, Block *succ)
 105 {
 106    add_logical_edge(pred_idx, succ);
 107    add_linear_edge(pred_idx, succ);
 108 }
 109
 110 static void append_logical_start(Block *b)
 111 {
 112    Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
 113 }
 114
 115 static void append_logical_end(Block *b)
 116 {
 117    Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
 118 }
 119
 120 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
 121 {
 122    assert(ctx->allocated[def->index].id());
 123    return ctx->allocated[def->index];
 124 }
 125
 126 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
 127 {
 128    Builder bld(ctx->program, ctx->block);
 129
 130    if (!dst.id())
 131       dst = bld.tmp(src.regClass());
 132
 133    if (ctx->stage != fragment_fs) {
 134       if (!dst.id())
 135          return src;
 136
 137       if (src.type() == RegType::vgpr || src.size() > 1)
 138          bld.copy(Definition(dst), src);
 139       else
 140          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
 141       return dst;
 142    }
 143
 144    bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
 145    ctx->program->needs_wqm |= program_needs_wqm;
 146    return dst;
 147 }
 148
 149 Temp as_vgpr(isel_context *ctx, Temp val)
 150 {
 151    if (val.type() == RegType::sgpr) {
 152       Builder bld(ctx->program, ctx->block);
 153       return bld.copy(bld.def(RegType::vgpr, val.size()), val);
 154    }
 155    assert(val.type() == RegType::vgpr);
 156    return val;
 157 }
 158
 159 //assumes a != 0xffffffff
 160 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
 161 {
 162    assert(b != 0);
 163    Builder bld(ctx->program, ctx->block);
 164
 165    if (util_is_power_of_two_or_zero(b)) {
 166       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
 167       return;
 168    }
 169
 170    util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
 171
 172    assert(info.multiplier <= 0xffffffff);
 173
 174    bool pre_shift = info.pre_shift != 0;
 175    bool increment = info.increment != 0;
 176    bool multiply = true;
 177    bool post_shift = info.post_shift != 0;
 178
 179    if (!pre_shift && !increment && !multiply && !post_shift) {
 180       bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
 181       return;
 182    }
 183
 184    Temp pre_shift_dst = a;
 185    if (pre_shift) {
 186       pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
 187       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
 188    }
 189
 190    Temp increment_dst = pre_shift_dst;
 191    if (increment) {
 192       increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
 193       bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
 194    }
 195
 196    Temp multiply_dst = increment_dst;
 197    if (multiply) {
 198       multiply_dst = post_shift ? bld.tmp(v1) : dst;
 199       bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
 200                bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
 201    }
 202
 203    if (post_shift) {
 204       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
 205    }
 206 }
 207
 208 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
 209 {
 210    Builder bld(ctx->program, ctx->block);
 211    bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
 212 }
 213
 214
 215 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
 216 {
 217    /* no need to extract the whole vector */
 218    if (src.regClass() == dst_rc) {
 219       assert(idx == 0);
 220       return src;
 221    }
 222    assert(src.size() > idx);
 223    Builder bld(ctx->program, ctx->block);
 224    auto it = ctx->allocated_vec.find(src.id());
 225    /* the size check needs to be early because elements other than 0 may be garbage */
 226    if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) {
 227       if (it->second[idx].regClass() == dst_rc) {
 228          return it->second[idx];
 229       } else {
 230          assert(dst_rc.size() == it->second[idx].regClass().size());
 231          assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
 232          return bld.copy(bld.def(dst_rc), it->second[idx]);
 233       }
 234    }
 235
 236    if (src.size() == dst_rc.size()) {
 237       assert(idx == 0);
 238       return bld.copy(bld.def(dst_rc), src);
 239    } else {
 240       Temp dst = bld.tmp(dst_rc);
 241       emit_extract_vector(ctx, src, idx, dst);
 242       return dst;
 243    }
 244 }
 245
 246 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
 247 {
 248    if (num_components == 1)
 249       return;
 250    if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
 251       return;
 252    aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
 253    split->operands[0] = Operand(vec_src);
 254    std::array<Temp,4> elems;
 255    for (unsigned i = 0; i < num_components; i++) {
 256       elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
 257       split->definitions[i] = Definition(elems[i]);
 258    }
 259    ctx->block->instructions.emplace_back(std::move(split));
 260    ctx->allocated_vec.emplace(vec_src.id(), elems);
 261 }
 262
 263 /* This vector expansion uses a mask to determine which elements in the new vector
 264  * come from the original vector. The other elements are undefined. */
 265 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
 266 {
 267    emit_split_vector(ctx, vec_src, util_bitcount(mask));
 268
 269    if (vec_src == dst)
 270       return;
 271
 272    Builder bld(ctx->program, ctx->block);
 273    if (num_components == 1) {
 274       if (dst.type() == RegType::sgpr)
 275          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
 276       else
 277          bld.copy(Definition(dst), vec_src);
 278       return;
 279    }
 280
 281    unsigned component_size = dst.size() / num_components;
 282    std::array<Temp,4> elems;
 283
 284    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
 285    vec->definitions[0] = Definition(dst);
 286    unsigned k = 0;
 287    for (unsigned i = 0; i < num_components; i++) {
 288       if (mask & (1 << i)) {
 289          Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
 290          if (dst.type() == RegType::sgpr)
 291             src = bld.as_uniform(src);
 292          vec->operands[i] = Operand(src);
 293       } else {
 294          vec->operands[i] = Operand(0u);
 295       }
 296       elems[i] = vec->operands[i].getTemp();
 297    }
 298    ctx->block->instructions.emplace_back(std::move(vec));
 299    ctx->allocated_vec.emplace(dst.id(), elems);
 300 }
 301
 302 Temp as_divergent_bool(isel_context *ctx, Temp val, bool vcc_hint)
 303 {
 304    if (val.regClass() == s2) {
 305       return val;
 306    } else {
 307       assert(val.regClass() == s1);
 308       Builder bld(ctx->program, ctx->block);
 309       Definition& def = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2),
 310                                  Operand((uint32_t) -1), Operand(0u), bld.scc(val)).def(0);
 311       if (vcc_hint)
 312          def.setHint(vcc);
 313       return def.getTemp();
 314    }
 315 }
 316
 317 Temp as_uniform_bool(isel_context *ctx, Temp val)
 318 {
 319    if (val.regClass() == s1) {
 320       return val;
 321    } else {
 322       assert(val.regClass() == s2);
 323       Builder bld(ctx->program, ctx->block);
 324       return bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), Operand(0u), Operand(val));
 325    }
 326 }
 327
 328 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
 329 {
 330    if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
 331       return get_ssa_temp(ctx, src.src.ssa);
 332
 333    if (src.src.ssa->num_components == size) {
 334       bool identity_swizzle = true;
 335       for (unsigned i = 0; identity_swizzle && i < size; i++) {
 336          if (src.swizzle[i] != i)
 337             identity_swizzle = false;
 338       }
 339       if (identity_swizzle)
 340          return get_ssa_temp(ctx, src.src.ssa);
 341    }
 342
 343    Temp vec = get_ssa_temp(ctx, src.src.ssa);
 344    unsigned elem_size = vec.size() / src.src.ssa->num_components;
 345    assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */
 346    assert(vec.size() % elem_size == 0);
 347
 348    RegClass elem_rc = RegClass(vec.type(), elem_size);
 349    if (size == 1) {
 350       return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
 351    } else {
 352       assert(size <= 4);
 353       std::array<Temp,4> elems;
 354       aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
 355       for (unsigned i = 0; i < size; ++i) {
 356          elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
 357          vec_instr->operands[i] = Operand{elems[i]};
 358       }
 359       Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)};
 360       vec_instr->definitions[0] = Definition(dst);
 361       ctx->block->instructions.emplace_back(std::move(vec_instr));
 362       ctx->allocated_vec.emplace(dst.id(), elems);
 363       return dst;
 364    }
 365 }
 366
 367 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
 368 {
 369    if (ptr.size() == 2)
 370       return ptr;
 371    Builder bld(ctx->program, ctx->block);
 372    if (ptr.type() == RegType::vgpr)
 373       ptr = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), ptr);
 374    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
 375                      ptr, Operand((unsigned)ctx->options->address32_hi));
 376 }
 377
 378 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
 379 {
 380    aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
 381    sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
 382    sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
 383    sop2->definitions[0] = Definition(dst);
 384    if (writes_scc)
 385       sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
 386    ctx->block->instructions.emplace_back(std::move(sop2));
 387 }
 388
 389 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool commutative, bool swap_srcs=false)
 390 {
 391    Builder bld(ctx->program, ctx->block);
 392    Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
 393    Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
 394    if (src1.type() == RegType::sgpr) {
 395       if (commutative && src0.type() == RegType::vgpr) {
 396          Temp t = src0;
 397          src0 = src1;
 398          src1 = t;
 399       } else if (src0.type() == RegType::vgpr &&
 400                  op != aco_opcode::v_madmk_f32 &&
 401                  op != aco_opcode::v_madak_f32 &&
 402                  op != aco_opcode::v_madmk_f16 &&
 403                  op != aco_opcode::v_madak_f16) {
 404          /* If the instruction is not commutative, we emit a VOP3A instruction */
 405          bld.vop2_e64(op, Definition(dst), src0, src1);
 406          return;
 407       } else {
 408          src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
 409       }
 410    }
 411    bld.vop2(op, Definition(dst), src0, src1);
 412 }
 413
 414 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 415 {
 416    Temp src0 = get_alu_src(ctx, instr->src[0]);
 417    Temp src1 = get_alu_src(ctx, instr->src[1]);
 418    Temp src2 = get_alu_src(ctx, instr->src[2]);
 419
 420    /* ensure that the instruction has at most 1 sgpr operand
 421     * The optimizer will inline constants for us */
 422    if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
 423       src0 = as_vgpr(ctx, src0);
 424    if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
 425       src1 = as_vgpr(ctx, src1);
 426    if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
 427       src2 = as_vgpr(ctx, src2);
 428
 429    Builder bld(ctx->program, ctx->block);
 430    bld.vop3(op, Definition(dst), src0, src1, src2);
 431 }
 432
 433 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 434 {
 435    Builder bld(ctx->program, ctx->block);
 436    bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
 437 }
 438
 439 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 440 {
 441    Temp src0 = get_alu_src(ctx, instr->src[0]);
 442    Temp src1 = get_alu_src(ctx, instr->src[1]);
 443    aco_ptr<Instruction> vopc;
 444    if (src1.type() == RegType::sgpr) {
 445       if (src0.type() == RegType::vgpr) {
 446          /* to swap the operands, we might also have to change the opcode */
 447          switch (op) {
 448             case aco_opcode::v_cmp_lt_f32:
 449                op = aco_opcode::v_cmp_gt_f32;
 450                break;
 451             case aco_opcode::v_cmp_ge_f32:
 452                op = aco_opcode::v_cmp_le_f32;
 453                break;
 454             case aco_opcode::v_cmp_lt_i32:
 455                op = aco_opcode::v_cmp_gt_i32;
 456                break;
 457             case aco_opcode::v_cmp_ge_i32:
 458                op = aco_opcode::v_cmp_le_i32;
 459                break;
 460             case aco_opcode::v_cmp_lt_u32:
 461                op = aco_opcode::v_cmp_gt_u32;
 462                break;
 463             case aco_opcode::v_cmp_ge_u32:
 464                op = aco_opcode::v_cmp_le_u32;
 465                break;
 466             case aco_opcode::v_cmp_lt_f64:
 467                op = aco_opcode::v_cmp_gt_f64;
 468                break;
 469             case aco_opcode::v_cmp_ge_f64:
 470                op = aco_opcode::v_cmp_le_f64;
 471                break;
 472             case aco_opcode::v_cmp_lt_i64:
 473                op = aco_opcode::v_cmp_gt_i64;
 474                break;
 475             case aco_opcode::v_cmp_ge_i64:
 476                op = aco_opcode::v_cmp_le_i64;
 477                break;
 478             case aco_opcode::v_cmp_lt_u64:
 479                op = aco_opcode::v_cmp_gt_u64;
 480                break;
 481             case aco_opcode::v_cmp_ge_u64:
 482                op = aco_opcode::v_cmp_le_u64;
 483                break;
 484             default: /* eq and ne are commutative */
 485                break;
 486          }
 487          Temp t = src0;
 488          src0 = src1;
 489          src1 = t;
 490       } else {
 491          src1 = as_vgpr(ctx, src1);
 492       }
 493    }
 494    Builder bld(ctx->program, ctx->block);
 495    bld.vopc(op, Definition(dst), src0, src1).def(0).setHint(vcc);
 496 }
 497
 498 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 499 {
 500    if (dst.regClass() == s2) {
 501       emit_vopc_instruction(ctx, instr, op, dst);
 502       if (!ctx->divergent_vals[instr->dest.dest.ssa.index])
 503          emit_split_vector(ctx, dst, 2);
 504    } else if (dst.regClass() == s1) {
 505       Temp src0 = get_alu_src(ctx, instr->src[0]);
 506       Temp src1 = get_alu_src(ctx, instr->src[1]);
 507       assert(src0.type() == RegType::sgpr && src1.type() == RegType::sgpr);
 508
 509       Builder bld(ctx->program, ctx->block);
 510       bld.sopc(op, bld.scc(Definition(dst)), src0, src1);
 511
 512    } else {
 513       assert(false);
 514    }
 515 }
 516
 517 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, aco_opcode op32, aco_opcode op64, Temp dst)
 518 {
 519    Builder bld(ctx->program, ctx->block);
 520    Temp src0 = get_alu_src(ctx, instr->src[0]);
 521    Temp src1 = get_alu_src(ctx, instr->src[1]);
 522    if (dst.regClass() == s2) {
 523       bld.sop2(op64, Definition(dst), bld.def(s1, scc),
 524                as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
 525    } else {
 526       assert(dst.regClass() == s1);
 527       bld.sop2(op32, bld.def(s1), bld.scc(Definition(dst)),
 528                as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
 529    }
 530 }
 531
 532
 533 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
 534 {
 535    Builder bld(ctx->program, ctx->block);
 536    Temp cond = get_alu_src(ctx, instr->src[0]);
 537    Temp then = get_alu_src(ctx, instr->src[1]);
 538    Temp els = get_alu_src(ctx, instr->src[2]);
 539
 540    if (dst.type() == RegType::vgpr) {
 541       cond = as_divergent_bool(ctx, cond, true);
 542
 543       aco_ptr<Instruction> bcsel;
 544       if (dst.size() == 1) {
 545          then = as_vgpr(ctx, then);
 546          els = as_vgpr(ctx, els);
 547
 548          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
 549       } else if (dst.size() == 2) {
 550          Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
 551          bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
 552          Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
 553          bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
 554
 555          Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
 556          Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
 557
 558          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
 559       } else {
 560          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 561          nir_print_instr(&instr->instr, stderr);
 562          fprintf(stderr, "\n");
 563       }
 564       return;
 565    }
 566
 567    if (instr->dest.dest.ssa.bit_size != 1) { /* uniform condition and values in sgpr */
 568       if (dst.regClass() == s1 || dst.regClass() == s2) {
 569          assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
 570          aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
 571          bld.sop2(op, Definition(dst), then, els, bld.scc(as_uniform_bool(ctx, cond)));
 572       } else {
 573          fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
 574          nir_print_instr(&instr->instr, stderr);
 575          fprintf(stderr, "\n");
 576       }
 577       return;
 578    }
 579
 580    /* boolean bcsel */
 581    assert(instr->dest.dest.ssa.bit_size == 1);
 582
 583    if (dst.regClass() == s1)
 584       cond = as_uniform_bool(ctx, cond);
 585
 586    if (cond.regClass() == s1) { /* uniform selection */
 587       aco_opcode op;
 588       if (dst.regClass() == s2) {
 589          op = aco_opcode::s_cselect_b64;
 590          then = as_divergent_bool(ctx, then, false);
 591          els = as_divergent_bool(ctx, els, false);
 592       } else {
 593          assert(dst.regClass() == s1);
 594          op = aco_opcode::s_cselect_b32;
 595          then = as_uniform_bool(ctx, then);
 596          els = as_uniform_bool(ctx, els);
 597       }
 598       bld.sop2(op, Definition(dst), then, els, bld.scc(cond));
 599       return;
 600    }
 601
 602    /* divergent boolean bcsel
 603     * this implements bcsel on bools: dst = s0 ? s1 : s2
 604     * are going to be: dst = (s0 & s1) | (~s0 & s2) */
 605    assert (dst.regClass() == s2);
 606    then = as_divergent_bool(ctx, then, false);
 607    els = as_divergent_bool(ctx, els, false);
 608
 609    if (cond.id() != then.id())
 610       then = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), cond, then);
 611
 612    if (cond.id() == els.id())
 613       bld.sop1(aco_opcode::s_mov_b64, Definition(dst), then);
 614    else
 615       bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), then,
 616                bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond));
 617 }
 618
 619 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
 620 {
 621    if (!instr->dest.dest.is_ssa) {
 622       fprintf(stderr, "nir alu dst not in ssa: ");
 623       nir_print_instr(&instr->instr, stderr);
 624       fprintf(stderr, "\n");
 625       abort();
 626    }
 627    Builder bld(ctx->program, ctx->block);
 628    Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
 629    switch(instr->op) {
 630    case nir_op_vec2:
 631    case nir_op_vec3:
 632    case nir_op_vec4: {
 633       std::array<Temp,4> elems;
 634       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
 635       for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
 636          elems[i] = get_alu_src(ctx, instr->src[i]);
 637          vec->operands[i] = Operand{elems[i]};
 638       }
 639       vec->definitions[0] = Definition(dst);
 640       ctx->block->instructions.emplace_back(std::move(vec));
 641       ctx->allocated_vec.emplace(dst.id(), elems);
 642       break;
 643    }
 644    case nir_op_mov: {
 645       Temp src = get_alu_src(ctx, instr->src[0]);
 646       aco_ptr<Instruction> mov;
 647       if (dst.type() == RegType::sgpr) {
 648          if (src.type() == RegType::vgpr)
 649             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
 650          else if (src.regClass() == s1)
 651             bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
 652          else if (src.regClass() == s2)
 653             bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
 654          else
 655             unreachable("wrong src register class for nir_op_imov");
 656       } else if (dst.regClass() == v1) {
 657          bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
 658       } else if (dst.regClass() == v2) {
 659          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
 660       } else {
 661          nir_print_instr(&instr->instr, stderr);
 662          unreachable("Should have been lowered to scalar.");
 663       }
 664       break;
 665    }
 666    case nir_op_inot: {
 667       Temp src = get_alu_src(ctx, instr->src[0]);
 668       /* uniform booleans */
 669       if (instr->dest.dest.ssa.bit_size == 1 && dst.regClass() == s1) {
 670          if (src.regClass() == s1) {
 671             /* in this case, src is either 1 or 0 */
 672             bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.scc(Definition(dst)), Operand(1u), src);
 673          } else {
 674             /* src is either exec_mask or 0 */
 675             assert(src.regClass() == s2);
 676             bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(Definition(dst)), Operand(0u), src);
 677          }
 678       } else if (dst.regClass() == v1) {
 679          emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
 680       } else if (dst.type() == RegType::sgpr) {
 681          aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
 682          bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
 683       } else {
 684          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 685          nir_print_instr(&instr->instr, stderr);
 686          fprintf(stderr, "\n");
 687       }
 688       break;
 689    }
 690    case nir_op_ineg: {
 691       Temp src = get_alu_src(ctx, instr->src[0]);
 692       if (dst.regClass() == v1) {
 693          bld.vsub32(Definition(dst), Operand(0u), Operand(src));
 694       } else if (dst.regClass() == s1) {
 695          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
 696       } else if (dst.size() == 2) {
 697          Temp src0 = bld.tmp(dst.type(), 1);
 698          Temp src1 = bld.tmp(dst.type(), 1);
 699          bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
 700
 701          if (dst.regClass() == s2) {
 702             Temp carry = bld.tmp(s1);
 703             Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), Operand(0u), src0);
 704             Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), src1, carry);
 705             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
 706          } else {
 707             Temp lower = bld.tmp(v1);
 708             Temp borrow = bld.vsub32(Definition(lower), Operand(0u), src0, true).def(1).getTemp();
 709             Temp upper = bld.vsub32(bld.def(v1), Operand(0u), src1, false, borrow);
 710             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
 711          }
 712       } else {
 713          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 714          nir_print_instr(&instr->instr, stderr);
 715          fprintf(stderr, "\n");
 716       }
 717       break;
 718    }
 719    case nir_op_iabs: {
 720       if (dst.regClass() == s1) {
 721          bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
 722       } else if (dst.regClass() == v1) {
 723          Temp src = get_alu_src(ctx, instr->src[0]);
 724          bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
 725       } else {
 726          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 727          nir_print_instr(&instr->instr, stderr);
 728          fprintf(stderr, "\n");
 729       }
 730       break;
 731    }
 732    case nir_op_isign: {
           /* Integer sign: selects an implementation based on the register class of dst. */
 733       Temp src = get_alu_src(ctx, instr->src[0]);
 734       if (dst.regClass() == s1) {
              /* tmp = src >> 31 (arithmetic shift), gtz = SCC result of (src > 0),
               * dst = gtz + tmp. */
 735          Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
 736          Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u));
 737          bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp);
 738       } else if (dst.regClass() == s2) {
              /* neg = src >> 63 (arithmetic shift), neqz = SCC result of (src != 0),
               * dst = neg | neqz.
               * neqz is only a 32-bit SCC value, so it is passed as an operand fixed to
               * SCC instead of as a free s1 temporary: a free temporary could end up in
               * an ordinary SGPR, which a 64-bit instruction would read as a register pair.
               * NOTE(review): relies on SCC being zero-extended when it is read as a
               * 64-bit scalar source - confirm against the ISA documentation. */
 739          Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
 740          Temp neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
 741          bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz));
 742       } else if (dst.regClass() == v1) {
              /* Despite its name, gtz holds the per-lane result of (0 >= src). The
               * v_cndmask_b32 is given 1 and tmp (src >> 31) as its two value sources
               * and gtz as the selecting mask. */
 743          Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
 744          Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
 745          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
 746       } else if (dst.regClass() == v2) {
              /* Same scheme as the v1 path, applied to both halves: neg is the upper
               * dword shifted right by 31, and the two selects choose between neg and
               * the constants 1 (low dword) / 0 (high dword) using the (0 >= src) mask. */
 747          Temp upper = emit_extract_vector(ctx, src, 1, v1);
 748          Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
 749          Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
 750          Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
 751          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
 752          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
 753       } else {
 754          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 755          nir_print_instr(&instr->instr, stderr);
 756          fprintf(stderr, "\n");
 757       }
 758       break;
 759    }
 760    case nir_op_imax: {
 761       if (dst.regClass() == v1) {
 762          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
 763       } else if (dst.regClass() == s1) {
 764          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
 765       } else {
 766          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 767          nir_print_instr(&instr->instr, stderr);
 768          fprintf(stderr, "\n");
 769       }
 770       break;
 771    }
 772    case nir_op_umax: {
 773       if (dst.regClass() == v1) {
 774          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
 775       } else if (dst.regClass() == s1) {
 776          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
 777       } else {
 778          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 779          nir_print_instr(&instr->instr, stderr);
 780          fprintf(stderr, "\n");
 781       }
 782       break;
 783    }
 784    case nir_op_imin: {
 785       if (dst.regClass() == v1) {
 786          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
 787       } else if (dst.regClass() == s1) {
 788          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
 789       } else {
 790          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 791          nir_print_instr(&instr->instr, stderr);
 792          fprintf(stderr, "\n");
 793       }
 794       break;
 795    }
 796    case nir_op_umin: {
 797       if (dst.regClass() == v1) {
 798          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
 799       } else if (dst.regClass() == s1) {
 800          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
 801       } else {
 802          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 803          nir_print_instr(&instr->instr, stderr);
 804          fprintf(stderr, "\n");
 805       }
 806       break;
 807    }
 808    case nir_op_ior: {
 809       if (instr->dest.dest.ssa.bit_size == 1) {
 810          emit_boolean_logic(ctx, instr, aco_opcode::s_or_b32, aco_opcode::s_or_b64, dst);
 811       } else if (dst.regClass() == v1) {
 812          emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
 813       } else if (dst.regClass() == s1) {
 814          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
 815       } else if (dst.regClass() == s2) {
 816          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
 817       } else {
 818          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 819          nir_print_instr(&instr->instr, stderr);
 820          fprintf(stderr, "\n");
 821       }
 822       break;
 823    }
 824    case nir_op_iand: {
 825       if (instr->dest.dest.ssa.bit_size == 1) {
 826          emit_boolean_logic(ctx, instr, aco_opcode::s_and_b32, aco_opcode::s_and_b64, dst);
 827       } else if (dst.regClass() == v1) {
 828          emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
 829       } else if (dst.regClass() == s1) {
 830          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
 831       } else if (dst.regClass() == s2) {
 832          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
 833       } else {
 834          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 835          nir_print_instr(&instr->instr, stderr);
 836          fprintf(stderr, "\n");
 837       }
 838       break;
 839    }
 840    case nir_op_ixor: {
 841       if (instr->dest.dest.ssa.bit_size == 1) {
 842          emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::s_xor_b64, dst);
 843       } else if (dst.regClass() == v1) {
 844          emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
 845       } else if (dst.regClass() == s1) {
 846          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
 847       } else if (dst.regClass() == s2) {
 848          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
 849       } else {
 850          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 851          nir_print_instr(&instr->instr, stderr);
 852          fprintf(stderr, "\n");
 853       }
 854       break;
 855    }
 856    case nir_op_ushr: {
 857       if (dst.regClass() == v1) {
 858          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
 859       } else if (dst.regClass() == v2) {
 860          bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
 861                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
 862       } else if (dst.regClass() == s2) {
 863          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
 864       } else if (dst.regClass() == s1) {
 865          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
 866       } else {
 867          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 868          nir_print_instr(&instr->instr, stderr);
 869          fprintf(stderr, "\n");
 870       }
 871       break;
 872    }
 873    case nir_op_ishl: {
 874       if (dst.regClass() == v1) {
 875          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
 876       } else if (dst.regClass() == v2) {
 877          bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
 878                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
 879       } else if (dst.regClass() == s1) {
 880          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
 881       } else if (dst.regClass() == s2) {
 882          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
 883       } else {
 884          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 885          nir_print_instr(&instr->instr, stderr);
 886          fprintf(stderr, "\n");
 887       }
 888       break;
 889    }
 890    case nir_op_ishr: {
 891       if (dst.regClass() == v1) {
 892          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
 893       } else if (dst.regClass() == v2) {
 894          bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
 895                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
 896       } else if (dst.regClass() == s1) {
 897          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
 898       } else if (dst.regClass() == s2) {
 899          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
 900       } else {
 901          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 902          nir_print_instr(&instr->instr, stderr);
 903          fprintf(stderr, "\n");
 904       }
 905       break;
 906    }
 907    case nir_op_find_lsb: {
 908       Temp src = get_alu_src(ctx, instr->src[0]);
 909       if (src.regClass() == s1) {
 910          bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
 911       } else if (src.regClass() == v1) {
 912          emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
 913       } else if (src.regClass() == s2) {
 914          bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
 915       } else {
 916          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 917          nir_print_instr(&instr->instr, stderr);
 918          fprintf(stderr, "\n");
 919       }
 920       break;
 921    }
 922    case nir_op_ufind_msb:
 923    case nir_op_ifind_msb: {
           /* Find-MSB for unsigned (ufind_msb) and signed (ifind_msb) sources. The
            * opcode is picked from the source register class and the NIR op; its
            * result (msb_rev) is then subtracted from (bit width - 1), and -1 is
            * selected instead whenever that subtraction reports a carry/borrow. */
 924       Temp src = get_alu_src(ctx, instr->src[0]);
 925       if (src.regClass() == s1 || src.regClass() == s2) {
 926          aco_opcode op = src.regClass() == s2 ?
 927                          (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
 928                          (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
 929          Temp msb_rev = bld.sop1(op, bld.def(s1), src);
 930
              /* src.size() is in dwords, so this computes (bits - 1) - msb_rev. The
               * second definition of the subtraction is its SCC output. */
 931          Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
 932                                         Operand(src.size() * 32u - 1u), msb_rev);
 933          Temp msb = sub.def(0).getTemp();
 934          Temp carry = sub.def(1).getTemp();
 935
              /* s_cselect_b32 chooses between its two sources based on SCC, so the
               * condition has to be an operand fixed to SCC (as done for
               * nir_op_uadd_sat); an unfixed temporary is not guaranteed to be there. */
 936          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, bld.scc(carry));
 937       } else if (src.regClass() == v1) {
 938          aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
 939          Temp msb_rev = bld.tmp(v1);
 940          emit_vop1_instruction(ctx, instr, op, msb_rev);
 941          Temp msb = bld.tmp(v1);
              /* 31 - msb_rev, keeping the carry-out of the subtraction as select mask. */
 942          Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
 943          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
 944       } else {
 945          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 946          nir_print_instr(&instr->instr, stderr);
 947          fprintf(stderr, "\n");
 948       }
 949       break;
 950    }
 951    case nir_op_bitfield_reverse: {
 952       if (dst.regClass() == s1) {
 953          bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
 954       } else if (dst.regClass() == v1) {
 955          bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
 956       } else {
 957          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 958          nir_print_instr(&instr->instr, stderr);
 959          fprintf(stderr, "\n");
 960       }
 961       break;
 962    }
 963    case nir_op_iadd: {
 964       if (dst.regClass() == s1) {
 965          emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
 966          break;
 967       }
 968
 969       Temp src0 = get_alu_src(ctx, instr->src[0]);
 970       Temp src1 = get_alu_src(ctx, instr->src[1]);
 971       if (dst.regClass() == v1) {
 972          bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
 973          break;
 974       }
 975
 976       assert(src0.size() == 2 && src1.size() == 2);
 977       Temp src00 = bld.tmp(src0.type(), 1);
 978       Temp src01 = bld.tmp(dst.type(), 1);
 979       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
 980       Temp src10 = bld.tmp(src1.type(), 1);
 981       Temp src11 = bld.tmp(dst.type(), 1);
 982       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
 983
 984       if (dst.regClass() == s2) {
 985          Temp carry = bld.tmp(s1);
 986          Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
 987          Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
 988          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
 989       } else if (dst.regClass() == v2) {
 990          Temp dst0 = bld.tmp(v1);
 991          Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
 992          Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
 993          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
 994       } else {
 995          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 996          nir_print_instr(&instr->instr, stderr);
 997          fprintf(stderr, "\n");
 998       }
 999       break;
1000    }
1001    case nir_op_uadd_sat: {
1002       Temp src0 = get_alu_src(ctx, instr->src[0]);
1003       Temp src1 = get_alu_src(ctx, instr->src[1]);
1004       if (dst.regClass() == s1) {
1005          Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
1006          bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
1007                   src0, src1);
1008          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
1009       } else if (dst.regClass() == v1) {
1010          if (ctx->options->chip_class >= GFX9) {
1011             aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
1012             add->operands[0] = Operand(src0);
1013             add->operands[1] = Operand(src1);
1014             add->definitions[0] = Definition(dst);
1015             add->clamp = 1;
1016             ctx->block->instructions.emplace_back(std::move(add));
1017          } else {
1018             if (src1.regClass() != v1)
1019                std::swap(src0, src1);
1020             assert(src1.regClass() == v1);
1021             Temp tmp = bld.tmp(v1);
1022             Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
1023             bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
1024          }
1025       } else {
1026          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1027          nir_print_instr(&instr->instr, stderr);
1028          fprintf(stderr, "\n");
1029       }
1030       break;
1031    }
1032    case nir_op_uadd_carry: {
1033       Temp src0 = get_alu_src(ctx, instr->src[0]);
1034       Temp src1 = get_alu_src(ctx, instr->src[1]);
1035       if (dst.regClass() == s1) {
1036          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1037          break;
1038       }
1039       if (dst.regClass() == v1) {
1040          Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
1041          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
1042          break;
1043       }
1044
1045       Temp src00 = bld.tmp(src0.type(), 1);
1046       Temp src01 = bld.tmp(dst.type(), 1);
1047       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1048       Temp src10 = bld.tmp(src1.type(), 1);
1049       Temp src11 = bld.tmp(dst.type(), 1);
1050       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1051       if (dst.regClass() == s2) {
1052          Temp carry = bld.tmp(s1);
1053          bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1054          carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
1055          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1056       } else if (dst.regClass() == v2) {
1057          Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
1058          carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
1059          carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
1060          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
1061       } else {
1062          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1063          nir_print_instr(&instr->instr, stderr);
1064          fprintf(stderr, "\n");
1065       }
1066       break;
1067    }
1068    case nir_op_isub: {
           /* Integer subtraction. 32-bit results map to a single instruction; 64-bit
            * results are computed as two 32-bit subtractions chained through a borrow. */
1069       if (dst.regClass() == s1) {
1070          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1071          break;
1072       }
1073
1074       Temp src0 = get_alu_src(ctx, instr->src[0]);
1075       Temp src1 = get_alu_src(ctx, instr->src[1]);
1076       if (dst.regClass() == v1) {
1077          bld.vsub32(Definition(dst), src0, src1);
1078          break;
1079       }
1080
           /* Split both sources into dwords: srcX0 is element 0 and keeps the source's
            * register type, srcX1 is element 1 and is created with dst's register type. */
1081       Temp src00 = bld.tmp(src0.type(), 1);
1082       Temp src01 = bld.tmp(dst.type(), 1);
1083       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1084       Temp src10 = bld.tmp(src1.type(), 1);
1085       Temp src11 = bld.tmp(dst.type(), 1);
1086       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1087       if (dst.regClass() == s2) {
              /* The borrow of the low half is produced in SCC and must be consumed
               * from SCC by s_subb_u32, so it is passed as an operand fixed to SCC -
               * the same way nir_op_iadd feeds s_addc_u32 and nir_op_usub_borrow
               * feeds s_subb_u32. */
1088          Temp carry = bld.tmp(s1);
1089          Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1090          Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
1091          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1092       } else if (dst.regClass() == v2) {
              /* Low half with carry-out requested, high half with that borrow as carry-in. */
1093          Temp lower = bld.tmp(v1);
1094          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1095          Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1096          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1097       } else {
1098          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1099          nir_print_instr(&instr->instr, stderr);
1100          fprintf(stderr, "\n");
1101       }
1102       break;
1103    }
1104    case nir_op_usub_borrow: {
1105       Temp src0 = get_alu_src(ctx, instr->src[0]);
1106       Temp src1 = get_alu_src(ctx, instr->src[1]);
1107       if (dst.regClass() == s1) {
1108          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
1109          break;
1110       } else if (dst.regClass() == v1) {
1111          Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
1112          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
1113          break;
1114       }
1115
1116       Temp src00 = bld.tmp(src0.type(), 1);
1117       Temp src01 = bld.tmp(dst.type(), 1);
1118       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1119       Temp src10 = bld.tmp(src1.type(), 1);
1120       Temp src11 = bld.tmp(dst.type(), 1);
1121       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1122       if (dst.regClass() == s2) {
1123          Temp borrow = bld.tmp(s1);
1124          bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
1125          borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
1126          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1127       } else if (dst.regClass() == v2) {
1128          Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
1129          borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
1130          borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
1131          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
1132       } else {
1133          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1134          nir_print_instr(&instr->instr, stderr);
1135          fprintf(stderr, "\n");
1136       }
1137       break;
1138    }
1139    case nir_op_imul: {
1140       if (dst.regClass() == v1) {
1141          bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1142                   get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1143       } else if (dst.regClass() == s1) {
1144          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1145       } else {
1146          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1147          nir_print_instr(&instr->instr, stderr);
1148          fprintf(stderr, "\n");
1149       }
1150       break;
1151    }
1152    case nir_op_umul_high: {
1153       if (dst.regClass() == v1) {
1154          bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1155       } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1156          bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1157       } else if (dst.regClass() == s1) {
1158          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1159                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1160          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1161       } else {
1162          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1163          nir_print_instr(&instr->instr, stderr);
1164          fprintf(stderr, "\n");
1165       }
1166       break;
1167    }
1168    case nir_op_imul_high: {
1169       if (dst.regClass() == v1) {
1170          bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1171       } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1172          bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1173       } else if (dst.regClass() == s1) {
1174          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1175                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1176          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1177       } else {
1178          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1179          nir_print_instr(&instr->instr, stderr);
1180          fprintf(stderr, "\n");
1181       }
1182       break;
1183    }
1184    case nir_op_fmul: {
1185       if (dst.size() == 1) {
1186          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1187       } else if (dst.size() == 2) {
1188          bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1189                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1190       } else {
1191          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1192          nir_print_instr(&instr->instr, stderr);
1193          fprintf(stderr, "\n");
1194       }
1195       break;
1196    }
1197    case nir_op_fadd: {
1198       if (dst.size() == 1) {
1199          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1200       } else if (dst.size() == 2) {
1201          bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1202                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1203       } else {
1204          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1205          nir_print_instr(&instr->instr, stderr);
1206          fprintf(stderr, "\n");
1207       }
1208       break;
1209    }
1210    case nir_op_fsub: {
1211       Temp src0 = get_alu_src(ctx, instr->src[0]);
1212       Temp src1 = get_alu_src(ctx, instr->src[1]);
1213       if (dst.size() == 1) {
1214          if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
1215             emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
1216          else
1217             emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
1218       } else if (dst.size() == 2) {
1219          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
1220                                      get_alu_src(ctx, instr->src[0]),
1221                                      as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1222          VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1223          sub->neg[1] = true;
1224       } else {
1225          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1226          nir_print_instr(&instr->instr, stderr);
1227          fprintf(stderr, "\n");
1228       }
1229       break;
1230    }
1231    case nir_op_fmax: {
1232       if (dst.size() == 1) {
1233          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true);
1234       } else if (dst.size() == 2) {
1235          bld.vop3(aco_opcode::v_max_f64, Definition(dst),
1236                   get_alu_src(ctx, instr->src[0]),
1237                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1238       } else {
1239          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1240          nir_print_instr(&instr->instr, stderr);
1241          fprintf(stderr, "\n");
1242       }
1243       break;
1244    }
1245    case nir_op_fmin: {
1246       if (dst.size() == 1) {
1247          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true);
1248       } else if (dst.size() == 2) {
1249          bld.vop3(aco_opcode::v_min_f64, Definition(dst),
1250                   get_alu_src(ctx, instr->src[0]),
1251                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1252       } else {
1253          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1254          nir_print_instr(&instr->instr, stderr);
1255          fprintf(stderr, "\n");
1256       }
1257       break;
1258    }
1259    case nir_op_fmax3: {
1260       if (dst.size() == 1) {
1261          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst);
1262       } else {
1263          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1264          nir_print_instr(&instr->instr, stderr);
1265          fprintf(stderr, "\n");
1266       }
1267       break;
1268    }
1269    case nir_op_fmin3: {
1270       if (dst.size() == 1) {
1271          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst);
1272       } else {
1273          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1274          nir_print_instr(&instr->instr, stderr);
1275          fprintf(stderr, "\n");
1276       }
1277       break;
1278    }
1279    case nir_op_fmed3: {
1280       if (dst.size() == 1) {
1281          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst);
1282       } else {
1283          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1284          nir_print_instr(&instr->instr, stderr);
1285          fprintf(stderr, "\n");
1286       }
1287       break;
1288    }
1289    case nir_op_umax3: {
1290       if (dst.size() == 1) {
1291          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1292       } else {
1293          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1294          nir_print_instr(&instr->instr, stderr);
1295          fprintf(stderr, "\n");
1296       }
1297       break;
1298    }
1299    case nir_op_umin3: {
1300       if (dst.size() == 1) {
1301          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1302       } else {
1303          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1304          nir_print_instr(&instr->instr, stderr);
1305          fprintf(stderr, "\n");
1306       }
1307       break;
1308    }
1309    case nir_op_umed3: {
1310       if (dst.size() == 1) {
1311          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1312       } else {
1313          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1314          nir_print_instr(&instr->instr, stderr);
1315          fprintf(stderr, "\n");
1316       }
1317       break;
1318    }
1319    case nir_op_imax3: {
1320       if (dst.size() == 1) {
1321          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1322       } else {
1323          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1324          nir_print_instr(&instr->instr, stderr);
1325          fprintf(stderr, "\n");
1326       }
1327       break;
1328    }
1329    case nir_op_imin3: {
1330       if (dst.size() == 1) {
1331          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1332       } else {
1333          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1334          nir_print_instr(&instr->instr, stderr);
1335          fprintf(stderr, "\n");
1336       }
1337       break;
1338    }
1339    case nir_op_imed3: {
1340       if (dst.size() == 1) {
1341          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1342       } else {
1343          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1344          nir_print_instr(&instr->instr, stderr);
1345          fprintf(stderr, "\n");
1346       }
1347       break;
1348    }
1349    case nir_op_cube_face_coord: {
1350       Temp in = get_alu_src(ctx, instr->src[0], 3);
1351       Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1352                       emit_extract_vector(ctx, in, 1, v1),
1353                       emit_extract_vector(ctx, in, 2, v1) };
1354       Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
1355       ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
1356       Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
1357       Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
1358       sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
1359       tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
1360       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
1361       break;
1362    }
1363    case nir_op_cube_face_index: {
1364       Temp in = get_alu_src(ctx, instr->src[0], 3);
1365       Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
1366                       emit_extract_vector(ctx, in, 1, v1),
1367                       emit_extract_vector(ctx, in, 2, v1) };
1368       bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
1369       break;
1370    }
1371    case nir_op_bcsel: {
1372       emit_bcsel(ctx, instr, dst);
1373       break;
1374    }
1375    case nir_op_frsq: {
1376       if (dst.size() == 1) {
1377          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f32, dst);
1378       } else if (dst.size() == 2) {
1379          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1380       } else {
1381          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1382          nir_print_instr(&instr->instr, stderr);
1383          fprintf(stderr, "\n");
1384       }
1385       break;
1386    }
1387    case nir_op_fneg: {
1388       Temp src = get_alu_src(ctx, instr->src[0]);
1389       if (dst.size() == 1) {
1390          bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
1391       } else if (dst.size() == 2) {
1392          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1393          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1394          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
1395          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1396       } else {
1397          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1398          nir_print_instr(&instr->instr, stderr);
1399          fprintf(stderr, "\n");
1400       }
1401       break;
1402    }
1403    case nir_op_fabs: {
1404       Temp src = get_alu_src(ctx, instr->src[0]);
1405       if (dst.size() == 1) {
1406          bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
1407       } else if (dst.size() == 2) {
1408          Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
1409          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1410          upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
1411          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1412       } else {
1413          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1414          nir_print_instr(&instr->instr, stderr);
1415          fprintf(stderr, "\n");
1416       }
1417       break;
1418    }
1419    case nir_op_fsat: {
1420       Temp src = get_alu_src(ctx, instr->src[0]);
1421       if (dst.size() == 1) {
1422          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1423       } else if (dst.size() == 2) {
1424          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
1425          VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
1426          vop3->clamp = true;
1427       } else {
1428          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1429          nir_print_instr(&instr->instr, stderr);
1430          fprintf(stderr, "\n");
1431       }
1432       break;
1433    }
1434    case nir_op_flog2: {
1435       if (dst.size() == 1) {
1436          emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f32, dst);
1437       } else {
1438          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1439          nir_print_instr(&instr->instr, stderr);
1440          fprintf(stderr, "\n");
1441       }
1442       break;
1443    }
1444    case nir_op_frcp: {
1445       if (dst.size() == 1) {
1446          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f32, dst);
1447       } else if (dst.size() == 2) {
1448          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
1449       } else {
1450          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1451          nir_print_instr(&instr->instr, stderr);
1452          fprintf(stderr, "\n");
1453       }
1454       break;
1455    }
1456    case nir_op_fexp2: {
1457       if (dst.size() == 1) {
1458          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
1459       } else {
1460          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1461          nir_print_instr(&instr->instr, stderr);
1462          fprintf(stderr, "\n");
1463       }
1464       break;
1465    }
1466    case nir_op_fsqrt: {
1467       if (dst.size() == 1) {
1468          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f32, dst);
1469       } else if (dst.size() == 2) {
1470          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
1471       } else {
1472          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1473          nir_print_instr(&instr->instr, stderr);
1474          fprintf(stderr, "\n");
1475       }
1476       break;
1477    }
1478    case nir_op_ffract: {
1479       if (dst.size() == 1) {
1480          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
1481       } else if (dst.size() == 2) {
1482          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
1483       } else {
1484          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1485          nir_print_instr(&instr->instr, stderr);
1486          fprintf(stderr, "\n");
1487       }
1488       break;
1489    }
1490    case nir_op_ffloor: {
1491       if (dst.size() == 1) {
1492          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
1493       } else if (dst.size() == 2) {
1494          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f64, dst);
1495       } else {
1496          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1497          nir_print_instr(&instr->instr, stderr);
1498          fprintf(stderr, "\n");
1499       }
1500       break;
1501    }
1502    case nir_op_fceil: {
1503       if (dst.size() == 1) {
1504          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
1505       } else if (dst.size() == 2) {
1506          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
1507       } else {
1508          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1509          nir_print_instr(&instr->instr, stderr);
1510          fprintf(stderr, "\n");
1511       }
1512       break;
1513    }
1514    case nir_op_ftrunc: {
1515       if (dst.size() == 1) {
1516          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
1517       } else if (dst.size() == 2) {
1518          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f64, dst);
1519       } else {
1520          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1521          nir_print_instr(&instr->instr, stderr);
1522          fprintf(stderr, "\n");
1523       }
1524       break;
1525    }
1526    case nir_op_fround_even: {
1527       if (dst.size() == 1) {
1528          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
1529       } else if (dst.size() == 2) {
1530          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
1531       } else {
1532          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1533          nir_print_instr(&instr->instr, stderr);
1534          fprintf(stderr, "\n");
1535       }
1536       break;
1537    }
1538    case nir_op_fsin:
1539    case nir_op_fcos: {
1540       Temp src = get_alu_src(ctx, instr->src[0]);
1541       aco_ptr<Instruction> norm;
1542       if (dst.size() == 1) {
1543          Temp tmp;
1544          Operand half_pi(0x3e22f983u);
1545          if (src.type() == RegType::sgpr)
1546             tmp = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
1547          else
1548             tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
1549
1550          /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
1551          if (ctx->options->chip_class < GFX9)
1552             tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
1553
1554          aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
1555          bld.vop1(opcode, Definition(dst), tmp);
1556       } else {
1557          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1558          nir_print_instr(&instr->instr, stderr);
1559          fprintf(stderr, "\n");
1560       }
1561       break;
1562    }
1563    case nir_op_ldexp: {
1564       if (dst.size() == 1) {
1565          bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
1566                   as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1567                   get_alu_src(ctx, instr->src[1]));
1568       } else if (dst.size() == 2) {
1569          bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
1570                   as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1571                   get_alu_src(ctx, instr->src[1]));
1572       } else {
1573          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1574          nir_print_instr(&instr->instr, stderr);
1575          fprintf(stderr, "\n");
1576       }
1577       break;
1578    }
1579    case nir_op_frexp_sig: {
1580       if (dst.size() == 1) {
1581          bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst),
1582                   get_alu_src(ctx, instr->src[0]));
1583       } else if (dst.size() == 2) {
1584          bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst),
1585                   get_alu_src(ctx, instr->src[0]));
1586       } else {
1587          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1588          nir_print_instr(&instr->instr, stderr);
1589          fprintf(stderr, "\n");
1590       }
1591       break;
1592    }
1593    case nir_op_frexp_exp: {
1594       if (instr->src[0].src.ssa->bit_size == 32) {
1595          bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst),
1596                   get_alu_src(ctx, instr->src[0]));
1597       } else if (instr->src[0].src.ssa->bit_size == 64) {
1598          bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst),
1599                   get_alu_src(ctx, instr->src[0]));
1600       } else {
1601          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1602          nir_print_instr(&instr->instr, stderr);
1603          fprintf(stderr, "\n");
1604       }
1605       break;
1606    }
1607    case nir_op_fsign: {
1608       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
1609       if (dst.size() == 1) {
1610          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1611          src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
1612          cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1613          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
1614       } else if (dst.size() == 2) {
1615          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1616          Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1617          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, src, cond);
1618
1619          cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1620          tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
1621          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
1622
1623          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1624       } else {
1625          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1626          nir_print_instr(&instr->instr, stderr);
1627          fprintf(stderr, "\n");
1628       }
1629       break;
1630    }
1631    case nir_op_f2f32: {
1632       if (instr->src[0].src.ssa->bit_size == 64) {
1633          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
1634       } else {
1635          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1636          nir_print_instr(&instr->instr, stderr);
1637          fprintf(stderr, "\n");
1638       }
1639       break;
1640    }
1641    case nir_op_f2f64: {
1642       if (instr->src[0].src.ssa->bit_size == 32) {
1643          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst);
1644       } else {
1645          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1646          nir_print_instr(&instr->instr, stderr);
1647          fprintf(stderr, "\n");
1648       }
1649       break;
1650    }
1651    case nir_op_i2f32: {
1652       assert(dst.size() == 1);
1653       emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst);
1654       break;
1655    }
1656    case nir_op_i2f64: {
1657       if (instr->src[0].src.ssa->bit_size == 32) {
1658          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst);
1659       } else if (instr->src[0].src.ssa->bit_size == 64) {
1660          Temp src = get_alu_src(ctx, instr->src[0]);
1661          RegClass rc = RegClass(src.type(), 1);
1662          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1663          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1664          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1665          upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
1666          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1667          bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1668
1669       } else {
1670          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1671          nir_print_instr(&instr->instr, stderr);
1672          fprintf(stderr, "\n");
1673       }
1674       break;
1675    }
1676    case nir_op_u2f32: {
1677       assert(dst.size() == 1);
1678       emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst);
1679       break;
1680    }
1681    case nir_op_u2f64: {
1682       if (instr->src[0].src.ssa->bit_size == 32) {
1683          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst);
1684       } else if (instr->src[0].src.ssa->bit_size == 64) {
1685          Temp src = get_alu_src(ctx, instr->src[0]);
1686          RegClass rc = RegClass(src.type(), 1);
1687          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1688          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1689          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1690          upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
1691          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1692          bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1693       } else {
1694          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1695          nir_print_instr(&instr->instr, stderr);
1696          fprintf(stderr, "\n");
1697       }
1698       break;
1699    }
1700    case nir_op_f2i32: {
1701       Temp src = get_alu_src(ctx, instr->src[0]);
1702       if (instr->src[0].src.ssa->bit_size == 32) {
1703          if (dst.type() == RegType::vgpr)
1704             bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
1705          else
1706             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1707                        bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
1708
1709       } else if (instr->src[0].src.ssa->bit_size == 64) {
1710          if (dst.type() == RegType::vgpr)
1711             bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
1712          else
1713             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1714                        bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
1715
1716       } else {
1717          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1718          nir_print_instr(&instr->instr, stderr);
1719          fprintf(stderr, "\n");
1720       }
1721       break;
1722    }
1723    case nir_op_f2u32: {
1724       Temp src = get_alu_src(ctx, instr->src[0]);
1725       if (instr->src[0].src.ssa->bit_size == 32) {
1726          if (dst.type() == RegType::vgpr)
1727             bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
1728          else
1729             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1730                        bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
1731
1732       } else if (instr->src[0].src.ssa->bit_size == 64) {
1733          if (dst.type() == RegType::vgpr)
1734             bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
1735          else
1736             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1737                        bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
1738
1739       } else {
1740          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1741          nir_print_instr(&instr->instr, stderr);
1742          fprintf(stderr, "\n");
1743       }
1744       break;
1745    }
1746    case nir_op_f2i64: {
1747       Temp src = get_alu_src(ctx, instr->src[0]);
1748       if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
1749          Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1750          exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
1751          Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1752          Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
1753          mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1754          mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
1755          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1756          Temp new_exponent = bld.tmp(v1);
1757          Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
1758          mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
1759          Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
1760          Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1761          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1762          lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
1763          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
1764          lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
1765          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
1766          Temp new_lower = bld.tmp(v1);
1767          borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
1768          Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
1769          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
1770
1771       } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
1772          if (src.type() == RegType::vgpr)
1773             src = bld.as_uniform(src);
1774          Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1775          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1776          exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1777          exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
1778          Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1779          Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
1780          mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1781          mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
1782          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1783          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
1784          mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
1785          Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
1786          Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
1787          mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
1788          Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1789          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1790          lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
1791          upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
1792          Temp borrow = bld.tmp(s1);
1793          lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
1794          upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
1795          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1796
1797       } else if (instr->src[0].src.ssa->bit_size == 64) {
1798          Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1799          Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1800          Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1801          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1802          Temp floor  = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1803          Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1804          Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1805          Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
1806          if (dst.type() == RegType::sgpr) {
1807             lower = bld.as_uniform(lower);
1808             upper = bld.as_uniform(upper);
1809          }
1810          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1811
1812       } else {
1813          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1814          nir_print_instr(&instr->instr, stderr);
1815          fprintf(stderr, "\n");
1816       }
1817       break;
1818    }
1819    case nir_op_f2u64: {
1820       Temp src = get_alu_src(ctx, instr->src[0]);
1821       if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
1822          Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1823          Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(64u), exponent);
1824          exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
1825          Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1826          mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1827          Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
1828          Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
1829          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1830          Temp new_exponent = bld.tmp(v1);
1831          Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
1832          mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
1833          Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1834          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1835          lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
1836          upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
1837          lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
1838          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
1839          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1840
1841       } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
1842          if (src.type() == RegType::vgpr)
1843             src = bld.as_uniform(src);
1844          Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1845          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1846          exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1847          Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1848          mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1849          Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
1850          Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
1851          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1852          Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
1853          mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
1854          Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
1855          mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
1856          Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1857          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1858          Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
1859          lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
1860          upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
1861          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1862
1863       } else if (instr->src[0].src.ssa->bit_size == 64) {
1864          Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1865          Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1866          Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1867          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1868          Temp floor  = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1869          Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1870          Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1871          Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
1872          if (dst.type() == RegType::sgpr) {
1873             lower = bld.as_uniform(lower);
1874             upper = bld.as_uniform(upper);
1875          }
1876          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1877
1878       } else {
1879          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1880          nir_print_instr(&instr->instr, stderr);
1881          fprintf(stderr, "\n");
1882       }
1883       break;
1884    }
1885    case nir_op_b2f32: {
1886       Temp src = get_alu_src(ctx, instr->src[0]);
1887       if (dst.regClass() == s1) {
1888          src = as_uniform_bool(ctx, src);
1889          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
1890       } else if (dst.regClass() == v1) {
1891          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u),
1892                       as_divergent_bool(ctx, src, true));
1893       } else {
1894          unreachable("Wrong destination register class for nir_op_b2f32.");
1895       }
1896       break;
1897    }
1898    case nir_op_b2f64: {
1899       Temp src = get_alu_src(ctx, instr->src[0]);
1900       if (dst.regClass() == s2) {
1901          src = as_uniform_bool(ctx, src);
1902          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
1903       } else if (dst.regClass() == v2) {
1904          Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v2), Operand(0x3FF00000u));
1905          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one,
1906                       as_divergent_bool(ctx, src, true));
1907          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1908       } else {
1909          unreachable("Wrong destination register class for nir_op_b2f64.");
1910       }
1911       break;
1912    }
1913    case nir_op_i2i32: {
1914       Temp src = get_alu_src(ctx, instr->src[0]);
1915       if (instr->src[0].src.ssa->bit_size == 64) {
1916          /* we can actually just say dst = src, as it would map the lower register */
1917          emit_extract_vector(ctx, src, 0, dst);
1918       } else {
1919          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1920          nir_print_instr(&instr->instr, stderr);
1921          fprintf(stderr, "\n");
1922       }
1923       break;
1924    }
1925    case nir_op_u2u32: {
1926       Temp src = get_alu_src(ctx, instr->src[0]);
1927       if (instr->src[0].src.ssa->bit_size == 16) {
1928          if (dst.regClass() == s1) {
1929             bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
1930          } else {
1931             // TODO: do better with SDWA
1932             bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src);
1933          }
1934       } else if (instr->src[0].src.ssa->bit_size == 64) {
1935          /* we can actually just say dst = src, as it would map the lower register */
1936          emit_extract_vector(ctx, src, 0, dst);
1937       } else {
1938          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1939          nir_print_instr(&instr->instr, stderr);
1940          fprintf(stderr, "\n");
1941       }
1942       break;
1943    }
1944    case nir_op_i2i64: {
1945       Temp src = get_alu_src(ctx, instr->src[0]);
1946       if (instr->src[0].src.ssa->bit_size == 32) {
1947          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
1948       } else {
1949          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1950          nir_print_instr(&instr->instr, stderr);
1951          fprintf(stderr, "\n");
1952       }
1953       break;
1954    }
1955    case nir_op_u2u64: {
1956       Temp src = get_alu_src(ctx, instr->src[0]);
1957       if (instr->src[0].src.ssa->bit_size == 32) {
1958          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
1959       } else {
1960          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1961          nir_print_instr(&instr->instr, stderr);
1962          fprintf(stderr, "\n");
1963       }
1964       break;
1965    }
1966    case nir_op_b2i32: {
1967       Temp src = get_alu_src(ctx, instr->src[0]);
1968       if (dst.regClass() == s1) {
1969          if (src.regClass() == s1) {
1970             bld.copy(Definition(dst), src);
1971          } else {
1972             // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
1973             assert(src.regClass() == s2);
1974             bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(Definition(dst)), Operand(0u), src);
1975          }
1976       } else {
1977          assert(dst.regClass() == v1 && src.regClass() == s2);
1978          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
1979       }
1980       break;
1981    }
1982    case nir_op_i2b1: {
1983       Temp src = get_alu_src(ctx, instr->src[0]);
1984       if (dst.regClass() == s2) {
1985          assert(src.regClass() == v1 || src.regClass() == v2);
1986          bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
1987                   Definition(dst), Operand(0u), src).def(0).setHint(vcc);
1988       } else {
1989          assert(src.regClass() == s1 && dst.regClass() == s1);
1990          bld.sopc(aco_opcode::s_cmp_lg_u32, bld.scc(Definition(dst)), Operand(0u), src);
1991       }
1992       break;
1993    }
1994    case nir_op_pack_64_2x32_split: {
1995       Temp src0 = get_alu_src(ctx, instr->src[0]);
1996       Temp src1 = get_alu_src(ctx, instr->src[1]);
1997
1998       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
1999       break;
2000    }
2001    case nir_op_unpack_64_2x32_split_x:
2002       bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2003       break;
2004    case nir_op_unpack_64_2x32_split_y:
2005       bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2006       break;
2007    case nir_op_pack_half_2x16: {
2008       Temp src = get_alu_src(ctx, instr->src[0], 2);
2009
2010       if (dst.regClass() == v1) {
2011          Temp src0 = bld.tmp(v1);
2012          Temp src1 = bld.tmp(v1);
2013          bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2014          bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2015
2016       } else {
2017          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2018          nir_print_instr(&instr->instr, stderr);
2019          fprintf(stderr, "\n");
2020       }
2021       break;
2022    }
2023    case nir_op_unpack_half_2x16_split_x: {
2024       if (dst.regClass() == v1) {
2025          Builder bld(ctx->program, ctx->block);
2026          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2027       } else {
2028          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2029          nir_print_instr(&instr->instr, stderr);
2030          fprintf(stderr, "\n");
2031       }
2032       break;
2033    }
2034    case nir_op_unpack_half_2x16_split_y: {
2035       if (dst.regClass() == v1) {
2036          Builder bld(ctx->program, ctx->block);
2037          /* TODO: use SDWA here */
2038          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2039                   bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2040       } else {
2041          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2042          nir_print_instr(&instr->instr, stderr);
2043          fprintf(stderr, "\n");
2044       }
2045       break;
2046    }
2047    case nir_op_fquantize2f16: {
2048       Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), get_alu_src(ctx, instr->src[0]));
2049
2050       Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */
2051
2052       Temp cmp_res = bld.tmp(s2);
2053       bld.vopc_e64(aco_opcode::v_cmp_class_f16, Definition(cmp_res), f16, mask).def(0).setHint(vcc);
2054
2055       Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2056
2057       bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2058       break;
2059    }
2060    case nir_op_bfm: {
2061       Temp bits = get_alu_src(ctx, instr->src[0]);
2062       Temp offset = get_alu_src(ctx, instr->src[1]);
2063
2064       if (dst.regClass() == s1) {
2065          bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2066       } else if (dst.regClass() == v1) {
2067          bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2068       } else {
2069          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2070          nir_print_instr(&instr->instr, stderr);
2071          fprintf(stderr, "\n");
2072       }
2073       break;
2074    }
2075    case nir_op_bitfield_select: {
2076       /* (mask & insert) | (~mask & base) */
2077       Temp bitmask = get_alu_src(ctx, instr->src[0]);
2078       Temp insert = get_alu_src(ctx, instr->src[1]);
2079       Temp base = get_alu_src(ctx, instr->src[2]);
2080
2081       /* dst = (insert & bitmask) | (base & ~bitmask) */
2082       if (dst.regClass() == s1) {
2083          aco_ptr<Instruction> sop2;
2084          nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2085          nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2086          Operand lhs;
2087          if (const_insert && const_bitmask) {
2088             lhs = Operand(const_insert->u32 & const_bitmask->u32);
2089          } else {
2090             insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2091             lhs = Operand(insert);
2092          }
2093
2094          Operand rhs;
2095          nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2096          if (const_base && const_bitmask) {
2097             rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2098          } else {
2099             base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2100             rhs = Operand(base);
2101          }
2102
2103          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2104
2105       } else if (dst.regClass() == v1) {
2106          if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2107             base = as_vgpr(ctx, base);
2108          if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2109             insert = as_vgpr(ctx, insert);
2110
2111          bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2112
2113       } else {
2114          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2115          nir_print_instr(&instr->instr, stderr);
2116          fprintf(stderr, "\n");
2117       }
2118       break;
2119    }
2120    case nir_op_ubfe:
2121    case nir_op_ibfe: {
2122       Temp base = get_alu_src(ctx, instr->src[0]);
2123       Temp offset = get_alu_src(ctx, instr->src[1]);
2124       Temp bits = get_alu_src(ctx, instr->src[2]);
2125
2126       if (dst.type() == RegType::sgpr) {
2127          Operand extract;
2128          nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2129          nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2130          if (const_offset && const_bits) {
2131             uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2132             extract = Operand(const_extract);
2133          } else {
2134             Operand width;
2135             if (const_bits) {
2136                width = Operand(const_bits->u32 << 16);
2137             } else {
2138                width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2139             }
2140             extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2141          }
2142
2143          aco_opcode opcode;
2144          if (dst.regClass() == s1) {
2145             if (instr->op == nir_op_ubfe)
2146                opcode = aco_opcode::s_bfe_u32;
2147             else
2148                opcode = aco_opcode::s_bfe_i32;
2149          } else if (dst.regClass() == s2) {
2150             if (instr->op == nir_op_ubfe)
2151                opcode = aco_opcode::s_bfe_u64;
2152             else
2153                opcode = aco_opcode::s_bfe_i64;
2154          } else {
2155             unreachable("Unsupported BFE bit size");
2156          }
2157
2158          bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2159
2160       } else {
2161          aco_opcode opcode;
2162          if (dst.regClass() == v1) {
2163             if (instr->op == nir_op_ubfe)
2164                opcode = aco_opcode::v_bfe_u32;
2165             else
2166                opcode = aco_opcode::v_bfe_i32;
2167          } else {
2168             unreachable("Unsupported BFE bit size");
2169          }
2170
2171          emit_vop3a_instruction(ctx, instr, opcode, dst);
2172       }
2173       break;
2174    }
2175    case nir_op_bit_count: {
2176       Temp src = get_alu_src(ctx, instr->src[0]);
2177       if (src.regClass() == s1) {
2178          bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2179       } else if (src.regClass() == v1) {
2180          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2181       } else if (src.regClass() == v2) {
2182          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2183                   emit_extract_vector(ctx, src, 1, v1),
2184                   bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2185                            emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2186       } else if (src.regClass() == s2) {
2187          bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2188       } else {
2189          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2190          nir_print_instr(&instr->instr, stderr);
2191          fprintf(stderr, "\n");
2192       }
2193       break;
2194    }
2195    case nir_op_flt: {
2196       if (instr->src[0].src.ssa->bit_size == 32)
2197          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f32, dst);
2198       else if (instr->src[0].src.ssa->bit_size == 64)
2199          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f64, dst);
2200       break;
2201    }
2202    case nir_op_fge: {
2203       if (instr->src[0].src.ssa->bit_size == 32)
2204          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f32, dst);
2205       else if (instr->src[0].src.ssa->bit_size == 64)
2206          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f64, dst);
2207       break;
2208    }
2209    case nir_op_feq: {
2210       if (instr->src[0].src.ssa->bit_size == 32)
2211          emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f32, dst);
2212       else if (instr->src[0].src.ssa->bit_size == 64)
2213          emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f64, dst);
2214       break;
2215    }
2216    case nir_op_fne: {
2217       if (instr->src[0].src.ssa->bit_size == 32)
2218          emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f32, dst);
2219       else if (instr->src[0].src.ssa->bit_size == 64)
2220          emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f64, dst);
2221       break;
2222    }
2223    case nir_op_ilt: {
2224       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2225          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i32, dst);
2226       else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2227          emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_i32, dst);
2228       else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2229          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i64, dst);
2230       break;
2231    }
2232    case nir_op_ige: {
2233       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2234          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i32, dst);
2235       else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2236          emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_i32, dst);
2237       else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2238          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i64, dst);
2239       break;
2240    }
2241    case nir_op_ieq: {
2242       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
2243          emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i32, dst);
2244       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
2245          emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_i32, dst);
2246       } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
2247          emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i64, dst);
2248       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
2249          emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_u64, dst);
2250       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
2251          Temp src0 = get_alu_src(ctx, instr->src[0]);
2252          Temp src1 = get_alu_src(ctx, instr->src[1]);
2253          bld.sopc(aco_opcode::s_cmp_eq_i32, bld.scc(Definition(dst)),
2254                   as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
2255       } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
2256          Temp src0 = get_alu_src(ctx, instr->src[0]);
2257          Temp src1 = get_alu_src(ctx, instr->src[1]);
2258          bld.sop2(aco_opcode::s_xnor_b64, Definition(dst), bld.def(s1, scc),
2259                   as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
2260       } else {
2261          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2262          nir_print_instr(&instr->instr, stderr);
2263          fprintf(stderr, "\n");
2264       }
2265       break;
2266    }
2267    case nir_op_ine: {
2268       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
2269          emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i32, dst);
2270       } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
2271          emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i64, dst);
2272       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
2273          emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_i32, dst);
2274       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
2275          emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_u64, dst);
2276       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
2277          Temp src0 = get_alu_src(ctx, instr->src[0]);
2278          Temp src1 = get_alu_src(ctx, instr->src[1]);
2279          bld.sopc(aco_opcode::s_cmp_lg_i32, bld.scc(Definition(dst)),
2280                   as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
2281       } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
2282          Temp src0 = get_alu_src(ctx, instr->src[0]);
2283          Temp src1 = get_alu_src(ctx, instr->src[1]);
2284          bld.sop2(aco_opcode::s_xor_b64, Definition(dst), bld.def(s1, scc),
2285                   as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
2286       } else {
2287          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2288          nir_print_instr(&instr->instr, stderr);
2289          fprintf(stderr, "\n");
2290       }
2291       break;
2292    }
2293    case nir_op_ult: {
2294       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2295          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u32, dst);
2296       else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2297          emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_u32, dst);
2298       else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2299          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u64, dst);
2300       break;
2301    }
2302    case nir_op_uge: {
2303       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2304          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u32, dst);
2305       else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2306          emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_u32, dst);
2307       else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2308          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u64, dst);
2309       break;
2310    }
2311    case nir_op_fddx:
2312    case nir_op_fddy:
2313    case nir_op_fddx_fine:
2314    case nir_op_fddy_fine:
2315    case nir_op_fddx_coarse:
2316    case nir_op_fddy_coarse: {
2317       Definition tl = bld.def(v1);
2318       uint16_t dpp_ctrl;
2319       if (instr->op == nir_op_fddx_fine) {
2320          bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 2, 2));
2321          dpp_ctrl = dpp_quad_perm(1, 1, 3, 3);
2322       } else if (instr->op == nir_op_fddy_fine) {
2323          bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 1, 0, 1));
2324          dpp_ctrl = dpp_quad_perm(2, 3, 2, 3);
2325       } else {
2326          bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 0, 0));
2327          if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
2328             dpp_ctrl = dpp_quad_perm(1, 1, 1, 1);
2329          else
2330             dpp_ctrl = dpp_quad_perm(2, 2, 2, 2);
2331       }
2332
2333       Definition tmp = bld.def(v1);
2334       bld.vop2_dpp(aco_opcode::v_sub_f32, tmp, get_alu_src(ctx, instr->src[0]), tl.getTemp(), dpp_ctrl);
2335       emit_wqm(ctx, tmp.getTemp(), dst, true);
2336       break;
2337    }
2338    default:
2339       fprintf(stderr, "Unknown NIR ALU instr: ");
2340       nir_print_instr(&instr->instr, stderr);
2341       fprintf(stderr, "\n");
2342    }
2343 }
2344
2345 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
2346 {
2347    Temp dst = get_ssa_temp(ctx, &instr->def);
2348
2349    // TODO: we really want to have the resulting type as this would allow for 64bit literals
2350    // which get truncated the lsb if double and msb if int
2351    // for now, we only use s_mov_b64 with 64bit inline constants
2352    assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
2353    assert(dst.type() == RegType::sgpr);
2354
2355    if (dst.size() == 1)
2356    {
2357       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(instr->value[0].u32));
2358    } else {
2359       assert(dst.size() != 1);
2360       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2361       if (instr->def.bit_size == 64)
2362          for (unsigned i = 0; i < dst.size(); i++)
2363             vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
2364       else {
2365          for (unsigned i = 0; i < dst.size(); i++)
2366             vec->operands[i] = Operand{instr->value[i].u32};
2367       }
2368       vec->definitions[0] = Definition(dst);
2369       ctx->block->instructions.emplace_back(std::move(vec));
2370    }
2371 }
2372
/* Widen every set bit of `mask` into a group of `multiplier` consecutive set
 * bits: bit i of the input becomes bits [i * multiplier, (i + 1) * multiplier)
 * of the result (e.g. mask 0b101 with multiplier 2 yields 0b110011).
 *
 * The result is truncated to 32 bits: groups that would start at or beyond
 * bit 32 are dropped and a group crossing bit 31 is cut off. A multiplier of
 * 0 yields 0.
 */
uint32_t widen_mask(uint32_t mask, unsigned multiplier)
{
   if (multiplier == 0)
      return 0;

   /* Shifting a 32-bit value by 32 or more is undefined, so build the
    * per-bit group without ever doing that. */
   const uint32_t group = multiplier >= 32 ? ~0u : (1u << multiplier) - 1u;

   uint32_t new_mask = 0;
   for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) {
      /* 64-bit product so that a huge multiplier cannot wrap around. */
      const uint64_t shift = (uint64_t)i * multiplier;
      if (shift >= 32)
         break; /* this and all higher groups fall outside the result */
      if (mask & (1u << i))
         new_mask |= group << shift;
   }
   return new_mask;
}
2381
2382 void visit_store_vs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2383 {
2384    /* This wouldn't work inside control flow or with indirect offsets but
2385     * that doesn't happen because of nir_lower_io_to_temporaries(). */
2386
2387    unsigned write_mask = nir_intrinsic_write_mask(instr);
2388    unsigned component = nir_intrinsic_component(instr);
2389    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2390    unsigned idx = nir_intrinsic_base(instr) + component;
2391
2392    nir_instr *off_instr = instr->src[1].ssa->parent_instr;
2393    if (off_instr->type != nir_instr_type_load_const) {
2394       fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2395       nir_print_instr(off_instr, stderr);
2396       fprintf(stderr, "\n");
2397    }
2398    idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
2399
2400    if (instr->src[0].ssa->bit_size == 64)
2401       write_mask = widen_mask(write_mask, 2);
2402
2403    for (unsigned i = 0; i < 8; ++i) {
2404       if (write_mask & (1 << i)) {
2405          ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u);
2406          ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
2407       }
2408       idx++;
2409    }
2410 }
2411
2412 void visit_store_fs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2413 {
2414    unsigned write_mask = nir_intrinsic_write_mask(instr);
2415    Operand values[4];
2416    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2417    for (unsigned i = 0; i < 4; ++i) {
2418       if (write_mask & (1 << i)) {
2419          Temp tmp = emit_extract_vector(ctx, src, i, v1);
2420          values[i] = Operand(tmp);
2421       } else {
2422          values[i] = Operand(v1);
2423       }
2424    }
2425
2426    unsigned index = nir_intrinsic_base(instr) / 4;
2427    unsigned target, col_format;
2428    unsigned enabled_channels = 0xF;
2429    aco_opcode compr_op = (aco_opcode)0;
2430
2431    nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2432    assert(offset && "Non-const offsets on exports not yet supported");
2433    index += offset->u32;
2434
2435    assert(index != FRAG_RESULT_COLOR);
2436
2437    /* Unlike vertex shader exports, it's fine to use multiple exports to
2438     * export separate channels of one target. So shaders which export both
2439     * FRAG_RESULT_SAMPLE_MASK and FRAG_RESULT_DEPTH should work fine.
2440     * TODO: combine the exports in those cases and create better code
2441     */
2442
2443    if (index == FRAG_RESULT_SAMPLE_MASK) {
2444
2445       if (ctx->program->info->ps.writes_z) {
2446          target = V_008DFC_SQ_EXP_MRTZ;
2447          enabled_channels = 0x4;
2448          col_format = (unsigned) -1;
2449
2450          values[2] = values[0];
2451          values[0] = Operand(v1);
2452       } else {
2453          aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2454          exp->valid_mask = false;
2455          exp->done = false;
2456          exp->compressed = true;
2457          exp->dest = V_008DFC_SQ_EXP_MRTZ;
2458          exp->enabled_mask = 0xc;
2459          for (int i = 0; i < 4; i++)
2460             exp->operands[i] = Operand(v1);
2461          exp->operands[1] = Operand(values[0]);
2462          ctx->block->instructions.emplace_back(std::move(exp));
2463          return;
2464       }
2465
2466    } else if (index == FRAG_RESULT_DEPTH) {
2467
2468       target = V_008DFC_SQ_EXP_MRTZ;
2469       enabled_channels = 0x1;
2470       col_format = (unsigned) -1;
2471
2472    } else if (index == FRAG_RESULT_STENCIL) {
2473
2474       if (ctx->program->info->ps.writes_z) {
2475          target = V_008DFC_SQ_EXP_MRTZ;
2476          enabled_channels = 0x2;
2477          col_format = (unsigned) -1;
2478
2479          values[1] = values[0];
2480          values[0] = Operand(v1);
2481       } else {
2482          aco_ptr<Instruction> shift{create_instruction<VOP2_instruction>(aco_opcode::v_lshlrev_b32, Format::VOP2, 2, 1)};
2483          shift->operands[0] = Operand((uint32_t) 16);
2484          shift->operands[1] = values[0];
2485          Temp tmp = {ctx->program->allocateId(), v1};
2486          shift->definitions[0] = Definition(tmp);
2487          ctx->block->instructions.emplace_back(std::move(shift));
2488
2489          aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2490          exp->valid_mask = false;
2491          exp->done = false;
2492          exp->compressed = true;
2493          exp->dest = V_008DFC_SQ_EXP_MRTZ;
2494          exp->enabled_mask = 0x3;
2495          exp->operands[0] = Operand(tmp);
2496          for (int i = 1; i < 4; i++)
2497             exp->operands[i] = Operand(v1);
2498          ctx->block->instructions.emplace_back(std::move(exp));
2499          return;
2500       }
2501
2502    } else {
2503       index -= FRAG_RESULT_DATA0;
2504       target = V_008DFC_SQ_EXP_MRT + index;
2505       col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
2506    }
2507    ASSERTED bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
2508    ASSERTED bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1;
2509    assert(!is_int8 && !is_int10);
2510
2511    switch (col_format)
2512    {
2513    case V_028714_SPI_SHADER_ZERO:
2514       enabled_channels = 0; /* writemask */
2515       target = V_008DFC_SQ_EXP_NULL;
2516       break;
2517
2518    case V_028714_SPI_SHADER_32_R:
2519       enabled_channels = 1;
2520       break;
2521
2522    case V_028714_SPI_SHADER_32_GR:
2523       enabled_channels = 0x3;
2524       break;
2525
2526    case V_028714_SPI_SHADER_32_AR:
2527       enabled_channels = 0x9;
2528       break;
2529
2530    case V_028714_SPI_SHADER_FP16_ABGR:
2531       enabled_channels = 0x5;
2532       compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
2533       break;
2534
2535    case V_028714_SPI_SHADER_UNORM16_ABGR:
2536       enabled_channels = 0x5;
2537       compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
2538       break;
2539
2540    case V_028714_SPI_SHADER_SNORM16_ABGR:
2541       enabled_channels = 0x5;
2542       compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
2543       break;
2544
2545    case V_028714_SPI_SHADER_UINT16_ABGR:
2546       enabled_channels = 0x5;
2547       compr_op = aco_opcode::v_cvt_pk_u16_u32;
2548       break;
2549
2550    case V_028714_SPI_SHADER_SINT16_ABGR:
2551       enabled_channels = 0x5;
2552       compr_op = aco_opcode::v_cvt_pk_i16_i32;
2553       break;
2554
2555    case V_028714_SPI_SHADER_32_ABGR:
2556       enabled_channels = 0xF;
2557       break;
2558
2559    default:
2560       break;
2561    }
2562
2563    if (target == V_008DFC_SQ_EXP_NULL)
2564       return;
2565
2566    if ((bool)compr_op)
2567    {
2568       for (int i = 0; i < 2; i++)
2569       {
2570          /* check if at least one of the values to be compressed is enabled */
2571          unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
2572          if (enabled) {
2573             enabled_channels |= enabled << (i*2);
2574             aco_ptr<VOP3A_instruction> compr{create_instruction<VOP3A_instruction>(compr_op, Format::VOP3A, 2, 1)};
2575             Temp tmp{ctx->program->allocateId(), v1};
2576             compr->operands[0] = values[i*2].isUndefined() ? Operand(0u) : values[i*2];
2577             compr->operands[1] = values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1];
2578             compr->definitions[0] = Definition(tmp);
2579             values[i] = Operand(tmp);
2580             ctx->block->instructions.emplace_back(std::move(compr));
2581          } else {
2582             values[i] = Operand(v1);
2583          }
2584       }
2585    }
2586
2587    aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2588    exp->valid_mask = false;
2589    exp->done = false;
2590    exp->compressed = (bool) compr_op;
2591    exp->dest = target;
2592    exp->enabled_mask = enabled_channels;
2593    if ((bool) compr_op) {
2594       for (int i = 0; i < 2; i++)
2595          exp->operands[i] = enabled_channels & (3 << (i * 2)) ? values[i] : Operand(v1);
2596       exp->operands[2] = Operand(v1);
2597       exp->operands[3] = Operand(v1);
2598    } else {
2599       for (int i = 0; i < 4; i++)
2600          exp->operands[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
2601    }
2602
2603    ctx->block->instructions.emplace_back(std::move(exp));
2604 }
2605
2606 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
2607 {
2608    if (ctx->stage == vertex_vs) {
2609       visit_store_vs_output(ctx, instr);
2610    } else if (ctx->stage == fragment_fs) {
2611       visit_store_fs_output(ctx, instr);
2612    } else {
2613       unreachable("Shader stage not implemented");
2614    }
2615 }
2616
2617 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
2618 {
2619    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
2620    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
2621
2622    Builder bld(ctx->program, ctx->block);
2623    Temp tmp = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
2624    bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), tmp, idx, component);
2625 }
2626
2627 void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
2628 {
2629    aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
2630    for (unsigned i = 0; i < num_components; i++)
2631       vec->operands[i] = Operand(ctx->fs_inputs[fs_input::frag_pos_0 + i]);
2632
2633    if (ctx->fs_vgpr_args[fs_input::frag_pos_3]) {
2634       assert(num_components == 4);
2635       Builder bld(ctx->program, ctx->block);
2636       vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ctx->fs_inputs[fs_input::frag_pos_3]);
2637    }
2638
2639    for (Operand& op : vec->operands)
2640       op = op.isUndefined() ? Operand(0u) : op;
2641
2642    vec->definitions[0] = Definition(dst);
2643    ctx->block->instructions.emplace_back(std::move(vec));
2644    emit_split_vector(ctx, dst, num_components);
2645    return;
2646 }
2647
2648 void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
2649 {
2650    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2651    Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
2652    unsigned idx = nir_intrinsic_base(instr);
2653    unsigned component = nir_intrinsic_component(instr);
2654    Temp prim_mask = ctx->prim_mask;
2655
2656    nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2657    if (offset) {
2658       assert(offset->u32 == 0);
2659    } else {
2660       /* the lower 15bit of the prim_mask contain the offset into LDS
2661        * while the upper bits contain the number of prims */
2662       Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
2663       assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
2664       Builder bld(ctx->program, ctx->block);
2665       Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
2666       stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
2667       stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
2668       offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
2669       prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
2670    }
2671
2672    if (instr->dest.ssa.num_components == 1) {
2673       emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
2674    } else {
2675       aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
2676       for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
2677       {
2678          Temp tmp = {ctx->program->allocateId(), v1};
2679          emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
2680          vec->operands[i] = Operand(tmp);
2681       }
2682       vec->definitions[0] = Definition(dst);
2683       ctx->block->instructions.emplace_back(std::move(vec));
2684    }
2685 }
2686
2687 unsigned get_num_channels_from_data_format(unsigned data_format)
2688 {
2689    switch (data_format) {
2690    case V_008F0C_BUF_DATA_FORMAT_8:
2691    case V_008F0C_BUF_DATA_FORMAT_16:
2692    case V_008F0C_BUF_DATA_FORMAT_32:
2693       return 1;
2694    case V_008F0C_BUF_DATA_FORMAT_8_8:
2695    case V_008F0C_BUF_DATA_FORMAT_16_16:
2696    case V_008F0C_BUF_DATA_FORMAT_32_32:
2697       return 2;
2698    case V_008F0C_BUF_DATA_FORMAT_10_11_11:
2699    case V_008F0C_BUF_DATA_FORMAT_11_11_10:
2700    case V_008F0C_BUF_DATA_FORMAT_32_32_32:
2701       return 3;
2702    case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
2703    case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
2704    case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
2705    case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
2706    case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
2707       return 4;
2708    default:
2709       break;
2710    }
2711
2712    return 4;
2713 }
2714
2715 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
2716  * so we may need to fix it up. */
2717 Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
2718 {
2719    Builder bld(ctx->program, ctx->block);
2720
2721    if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
2722       alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
2723
2724    /* For the integer-like cases, do a natural sign extension.
2725     *
2726     * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
2727     * and happen to contain 0, 1, 2, 3 as the two LSBs of the
2728     * exponent.
2729     */
2730    alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
2731    alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
2732
2733    /* Convert back to the right type. */
2734    if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
2735       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
2736       Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0xbf800000u), alpha);
2737       alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
2738    } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
2739       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
2740    }
2741
2742    return alpha;
2743 }
2744
2745 void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
2746 {
2747    Builder bld(ctx->program, ctx->block);
2748    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2749    if (ctx->stage & sw_vs) {
2750
2751       nir_instr *off_instr = instr->src[0].ssa->parent_instr;
2752       if (off_instr->type != nir_instr_type_load_const) {
2753          fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2754          nir_print_instr(off_instr, stderr);
2755          fprintf(stderr, "\n");
2756       }
2757       uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
2758
2759       Temp vertex_buffers = convert_pointer_to_64_bit(ctx, ctx->vertex_buffers);
2760
2761       unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
2762       unsigned component = nir_intrinsic_component(instr);
2763       unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
2764       uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
2765       uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
2766       unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
2767
2768       unsigned dfmt = attrib_format & 0xf;
2769
2770       unsigned nfmt = (attrib_format >> 4) & 0x7;
2771       unsigned num_dfmt_channels = get_num_channels_from_data_format(dfmt);
2772       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
2773       unsigned num_channels = MIN2(util_last_bit(mask), num_dfmt_channels);
2774       unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
2775       bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
2776       if (post_shuffle)
2777          num_channels = MAX2(num_channels, 3);
2778
2779       Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, Operand(attrib_binding * 16u));
2780
2781       Temp index;
2782       if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
2783          uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
2784          if (divisor) {
2785             ctx->needs_instance_id = true;
2786
2787             if (divisor != 1) {
2788                Temp divided = bld.tmp(v1);
2789                emit_v_div_u32(ctx, divided, as_vgpr(ctx, ctx->instance_id), divisor);
2790                index = bld.vadd32(bld.def(v1), ctx->start_instance, divided);
2791             } else {
2792                index = bld.vadd32(bld.def(v1), ctx->start_instance, ctx->instance_id);
2793             }
2794          } else {
2795             index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), ctx->start_instance);
2796          }
2797       } else {
2798          index = bld.vadd32(bld.def(v1), ctx->base_vertex, ctx->vertex_id);
2799       }
2800
2801       if (attrib_stride != 0 && attrib_offset > attrib_stride) {
2802          index = bld.vadd32(bld.def(v1), Operand(attrib_offset / attrib_stride), index);
2803          attrib_offset = attrib_offset % attrib_stride;
2804       }
2805
2806       Operand soffset(0u);
2807       if (attrib_offset >= 4096) {
2808          soffset = bld.copy(bld.def(s1), Operand(attrib_offset));
2809          attrib_offset = 0;
2810       }
2811
2812       aco_opcode opcode;
2813       switch (num_channels) {
2814       case 1:
2815          opcode = aco_opcode::tbuffer_load_format_x;
2816          break;
2817       case 2:
2818          opcode = aco_opcode::tbuffer_load_format_xy;
2819          break;
2820       case 3:
2821          opcode = aco_opcode::tbuffer_load_format_xyz;
2822          break;
2823       case 4:
2824          opcode = aco_opcode::tbuffer_load_format_xyzw;
2825          break;
2826       default:
2827          unreachable("Unimplemented load_input vector size");
2828       }
2829
2830       Temp tmp = post_shuffle || num_channels != dst.size() || alpha_adjust != RADV_ALPHA_ADJUST_NONE || component ? bld.tmp(RegType::vgpr, num_channels) : dst;
2831
2832       aco_ptr<MTBUF_instruction> mubuf{create_instruction<MTBUF_instruction>(opcode, Format::MTBUF, 3, 1)};
2833       mubuf->operands[0] = Operand(index);
2834       mubuf->operands[1] = Operand(list);
2835       mubuf->operands[2] = soffset;
2836       mubuf->definitions[0] = Definition(tmp);
2837       mubuf->idxen = true;
2838       mubuf->can_reorder = true;
2839       mubuf->dfmt = dfmt;
2840       mubuf->nfmt = nfmt;
2841       assert(attrib_offset < 4096);
2842       mubuf->offset = attrib_offset;
2843       ctx->block->instructions.emplace_back(std::move(mubuf));
2844
2845       emit_split_vector(ctx, tmp, tmp.size());
2846
2847       if (tmp.id() != dst.id()) {
2848          bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
2849                          nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
2850
2851          static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
2852          static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
2853          const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
2854
2855          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2856          for (unsigned i = 0; i < dst.size(); i++) {
2857             unsigned idx = i + component;
2858             if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE && num_channels >= 4) {
2859                Temp alpha = emit_extract_vector(ctx, tmp, swizzle[3], v1);
2860                vec->operands[3] = Operand(adjust_vertex_fetch_alpha(ctx, alpha_adjust, alpha));
2861             } else if (idx < num_channels) {
2862                vec->operands[i] = Operand(emit_extract_vector(ctx, tmp, swizzle[idx], v1));
2863             } else if (is_float && idx == 3) {
2864                vec->operands[i] = Operand(0x3f800000u);
2865             } else if (!is_float && idx == 3) {
2866                vec->operands[i] = Operand(1u);
2867             } else {
2868                vec->operands[i] = Operand(0u);
2869             }
2870          }
2871          vec->definitions[0] = Definition(dst);
2872          ctx->block->instructions.emplace_back(std::move(vec));
2873          emit_split_vector(ctx, dst, dst.size());
2874       }
2875
2876    } else if (ctx->stage == fragment_fs) {
2877       nir_instr *off_instr = instr->src[0].ssa->parent_instr;
2878       if (off_instr->type != nir_instr_type_load_const ||
2879           nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
2880          fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2881          nir_print_instr(off_instr, stderr);
2882          fprintf(stderr, "\n");
2883       }
2884
2885       Temp prim_mask = ctx->prim_mask;
2886       nir_const_value* offset = nir_src_as_const_value(instr->src[0]);
2887       if (offset) {
2888          assert(offset->u32 == 0);
2889       } else {
2890          /* the lower 15bit of the prim_mask contain the offset into LDS
2891           * while the upper bits contain the number of prims */
2892          Temp offset_src = get_ssa_temp(ctx, instr->src[0].ssa);
2893          assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
2894          Builder bld(ctx->program, ctx->block);
2895          Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
2896          stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
2897          stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
2898          offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
2899          prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
2900       }
2901
2902       unsigned idx = nir_intrinsic_base(instr);
2903       unsigned component = nir_intrinsic_component(instr);
2904
2905       if (dst.size() == 1) {
2906          bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(2u), bld.m0(prim_mask), idx, component);
2907       } else {
2908          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2909          for (unsigned i = 0; i < dst.size(); i++)
2910             vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(2u), bld.m0(prim_mask), idx, component + i);
2911          vec->definitions[0] = Definition(dst);
2912          bld.insert(std::move(vec));
2913       }
2914
2915    } else {
2916       unreachable("Shader stage not implemented");
2917    }
2918 }
2919
2920 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
2921 {
2922    if (ctx->program->info->need_indirect_descriptor_sets) {
2923       Builder bld(ctx->program, ctx->block);
2924       Temp ptr64 = convert_pointer_to_64_bit(ctx, ctx->descriptor_sets[0]);
2925       return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, Operand(desc_set << 2));//, false, false, false);
2926    }
2927
2928    return ctx->descriptor_sets[desc_set];
2929 }
2930
2931
2932 void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
2933 {
2934    Builder bld(ctx->program, ctx->block);
2935    Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
2936    if (!ctx->divergent_vals[instr->dest.ssa.index])
2937       index = bld.as_uniform(index);
2938    unsigned desc_set = nir_intrinsic_desc_set(instr);
2939    unsigned binding = nir_intrinsic_binding(instr);
2940
2941    Temp desc_ptr;
2942    radv_pipeline_layout *pipeline_layout = ctx->options->layout;
2943    radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
2944    unsigned offset = layout->binding[binding].offset;
2945    unsigned stride;
2946    if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
2947        layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
2948       unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
2949       desc_ptr = ctx->push_constants;
2950       offset = pipeline_layout->push_constant_size + 16 * idx;
2951       stride = 16;
2952    } else {
2953       desc_ptr = load_desc_ptr(ctx, desc_set);
2954       stride = layout->binding[binding].size;
2955    }
2956
2957    nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
2958    unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
2959    if (stride != 1) {
2960       if (nir_const_index) {
2961          const_index = const_index * stride;
2962       } else if (index.type() == RegType::vgpr) {
2963          bool index24bit = layout->binding[binding].array_size <= 0x1000000;
2964          index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit);
2965       } else {
2966          index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
2967       }
2968    }
2969    if (offset) {
2970       if (nir_const_index) {
2971          const_index = const_index + offset;
2972       } else if (index.type() == RegType::vgpr) {
2973          index = bld.vadd32(bld.def(v1), Operand(offset), index);
2974       } else {
2975          index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
2976       }
2977    }
2978
2979    if (nir_const_index && const_index == 0) {
2980       index = desc_ptr;
2981    } else if (index.type() == RegType::vgpr) {
2982       index = bld.vadd32(bld.def(v1),
2983                          nir_const_index ? Operand(const_index) : Operand(index),
2984                          Operand(desc_ptr));
2985    } else {
2986       index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
2987                        nir_const_index ? Operand(const_index) : Operand(index),
2988                        Operand(desc_ptr));
2989    }
2990
2991    bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), index);
2992 }
2993
/* Emits a buffer load of dst.size() dwords from the buffer resource `rsrc`
 * at byte offset `offset` into `dst`, then splits `dst` into
 * `num_components` elements.
 *
 * ctx:            instruction selection context.
 * num_components: number of vector components `dst` is split into.
 * dst:            destination temporary (SGPR or VGPR).
 * rsrc:           buffer resource descriptor.
 * offset:         byte offset (SGPR or VGPR).
 * glc:            sets the glc bit on the load (and dlc on GFX10+).
 *
 * VGPR destinations use MUBUF loads. SGPR destinations use SMEM loads,
 * except for glc loads before GFX8, which go through MUBUF and are made
 * uniform afterwards.
 */
void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, Temp rsrc, Temp offset, bool glc=false)
{
   Builder bld(ctx->program, ctx->block);

   unsigned num_bytes = dst.size() * 4;
   bool dlc = glc && ctx->options->chip_class >= GFX10;

   aco_opcode op;
   if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
      /* MUBUF path. Before GFX8 the offset is always passed in a VGPR. */
      if (ctx->options->chip_class < GFX8)
         offset = as_vgpr(ctx, offset);

      /* The offset goes either into vaddr (VGPR, with offen) or into soffset (SGPR). */
      Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
      Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
      unsigned const_offset = 0;

      /* More than 16 bytes need two loads: a dwordx4 for the first 16 bytes
       * and a second load for the remainder at constant offset 16. */
      Temp lower = Temp();
      if (num_bytes > 16) {
         assert(num_components == 3 || num_components == 4);
         op = aco_opcode::buffer_load_dwordx4;
         lower = bld.tmp(v4);
         aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
         mubuf->definitions[0] = Definition(lower);
         mubuf->operands[0] = vaddr;
         mubuf->operands[1] = Operand(rsrc);
         mubuf->operands[2] = soffset;
         mubuf->offen = (offset.type() == RegType::vgpr);
         mubuf->glc = glc;
         mubuf->dlc = dlc;
         mubuf->barrier = barrier_buffer;
         bld.insert(std::move(mubuf));
         emit_split_vector(ctx, lower, 2);
         num_bytes -= 16;
         const_offset = 16;
      }

      /* Opcode for the only (or the second) load. */
      switch (num_bytes) {
         case 4:
            op = aco_opcode::buffer_load_dword;
            break;
         case 8:
            op = aco_opcode::buffer_load_dwordx2;
            break;
         case 12:
            op = aco_opcode::buffer_load_dwordx3;
            break;
         case 16:
            op = aco_opcode::buffer_load_dwordx4;
            break;
         default:
            unreachable("Load SSBO not implemented for this size.");
      }
      aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
      mubuf->operands[0] = vaddr;
      mubuf->operands[1] = Operand(rsrc);
      mubuf->operands[2] = soffset;
      mubuf->offen = (offset.type() == RegType::vgpr);
      mubuf->glc = glc;
      mubuf->dlc = dlc;
      mubuf->barrier = barrier_buffer;
      mubuf->offset = const_offset;
      /* `instr` is the instruction that will finally define the result; its
       * definition is filled in below. */
      aco_ptr<Instruction> instr = std::move(mubuf);

      if (dst.size() > 4) {
         /* Two loads were needed: emit the second one into `upper` and
          * replace `instr` by a p_create_vector that joins both halves as
          * 64-bit (v2) elements. */
         assert(lower != Temp());
         Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size());
         instr->definitions[0] = Definition(upper);
         bld.insert(std::move(instr));
         if (dst.size() == 8)
            emit_split_vector(ctx, upper, 2);
         instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1));
         instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2));
         instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2));
         instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2));
         if (dst.size() == 8)
            instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2));
      }

      if (dst.type() == RegType::sgpr) {
         /* The result was produced in VGPRs but an SGPR destination was
          * requested: convert it with p_as_uniform. */
         Temp vec = bld.tmp(RegType::vgpr, dst.size());
         instr->definitions[0] = Definition(vec);
         bld.insert(std::move(instr));
         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
      } else {
         instr->definitions[0] = Definition(dst);
         bld.insert(std::move(instr));
      }
   } else {
      /* SMEM path. 12- and 24-byte loads use the next larger opcode and are
       * trimmed afterwards. */
      switch (num_bytes) {
         case 4:
            op = aco_opcode::s_buffer_load_dword;
            break;
         case 8:
            op = aco_opcode::s_buffer_load_dwordx2;
            break;
         case 12:
         case 16:
            op = aco_opcode::s_buffer_load_dwordx4;
            break;
         case 24:
         case 32:
            op = aco_opcode::s_buffer_load_dwordx8;
            break;
         default:
            unreachable("Load SSBO not implemented for this size.");
      }
      aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
      load->operands[0] = Operand(rsrc);
      load->operands[1] = Operand(bld.as_uniform(offset));
      assert(load->operands[1].getTemp().type() == RegType::sgpr);
      load->definitions[0] = Definition(dst);
      load->glc = glc;
      load->dlc = dlc;
      load->barrier = barrier_buffer;
      /* glc loads before GFX8 must have taken the MUBUF path above. */
      assert(ctx->options->chip_class >= GFX8 || !glc);

      /* trim vector */
      if (dst.size() == 3) {
         /* Load 4 dwords, keep the first 3. */
         Temp vec = bld.tmp(s4);
         load->definitions[0] = Definition(vec);
         bld.insert(std::move(load));
         emit_split_vector(ctx, vec, 4);

         bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
                    emit_extract_vector(ctx, vec, 0, s1),
                    emit_extract_vector(ctx, vec, 1, s1),
                    emit_extract_vector(ctx, vec, 2, s1));
      } else if (dst.size() == 6) {
         /* Load 8 dwords, keep the first 3 64-bit elements. */
         Temp vec = bld.tmp(s8);
         load->definitions[0] = Definition(vec);
         bld.insert(std::move(load));
         emit_split_vector(ctx, vec, 4);

         bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
                    emit_extract_vector(ctx, vec, 0, s2),
                    emit_extract_vector(ctx, vec, 1, s2),
                    emit_extract_vector(ctx, vec, 2, s2));
      } else {
         bld.insert(std::move(load));
      }

   }
   emit_split_vector(ctx, dst, num_components);
}
3138
3139 void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
3140 {
3141    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3142    Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
3143
3144    Builder bld(ctx->program, ctx->block);
3145
3146    nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3147    unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
3148    unsigned binding = nir_intrinsic_binding(idx_instr);
3149    radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
3150
3151    if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
3152       uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3153                            S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3154                            S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3155                            S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3156       if (ctx->options->chip_class >= GFX10) {
3157          desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3158                       S_008F0C_OOB_SELECT(3) |
3159                       S_008F0C_RESOURCE_LEVEL(1);
3160       } else {
3161          desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3162                       S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3163       }
3164       Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
3165                                      Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
3166                                      Operand(0xFFFFFFFFu),
3167                                      Operand(desc_type));
3168       rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3169                         rsrc, upper_dwords);
3170    } else {
3171       rsrc = convert_pointer_to_64_bit(ctx, rsrc);
3172       rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
3173    }
3174
3175    load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa));
3176 }
3177
3178 void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3179 {
3180    Builder bld(ctx->program, ctx->block);
3181    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3182
3183    unsigned offset = nir_intrinsic_base(instr);
3184    nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
3185    if (index_cv && instr->dest.ssa.bit_size == 32) {
3186
3187       unsigned count = instr->dest.ssa.num_components;
3188       unsigned start = (offset + index_cv->u32) / 4u;
3189       start -= ctx->base_inline_push_consts;
3190       if (start + count <= ctx->num_inline_push_consts) {
3191          std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
3192          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
3193          for (unsigned i = 0; i < count; ++i) {
3194             elems[i] = ctx->inline_push_consts[start + i];
3195             vec->operands[i] = Operand{elems[i]};
3196          }
3197          vec->definitions[0] = Definition(dst);
3198          ctx->block->instructions.emplace_back(std::move(vec));
3199          ctx->allocated_vec.emplace(dst.id(), elems);
3200          return;
3201       }
3202    }
3203
3204    Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
3205    if (offset != 0) // TODO check if index != 0 as well
3206       index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
3207    Temp ptr = convert_pointer_to_64_bit(ctx, ctx->push_constants);
3208    Temp vec = dst;
3209    bool trim = false;
3210    aco_opcode op;
3211
3212    switch (dst.size()) {
3213    case 1:
3214       op = aco_opcode::s_load_dword;
3215       break;
3216    case 2:
3217       op = aco_opcode::s_load_dwordx2;
3218       break;
3219    case 3:
3220       vec = bld.tmp(s4);
3221       trim = true;
3222    case 4:
3223       op = aco_opcode::s_load_dwordx4;
3224       break;
3225    case 6:
3226       vec = bld.tmp(s8);
3227       trim = true;
3228    case 8:
3229       op = aco_opcode::s_load_dwordx8;
3230       break;
3231    default:
3232       unreachable("unimplemented or forbidden load_push_constant.");
3233    }
3234
3235    bld.smem(op, Definition(vec), ptr, index);
3236
3237    if (trim) {
3238       emit_split_vector(ctx, vec, 4);
3239       RegClass rc = dst.size() == 3 ? s1 : s2;
3240       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3241                  emit_extract_vector(ctx, vec, 0, rc),
3242                  emit_extract_vector(ctx, vec, 1, rc),
3243                  emit_extract_vector(ctx, vec, 2, rc));
3244
3245    }
3246    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
3247 }
3248
3249 void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3250 {
3251    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3252
3253    Builder bld(ctx->program, ctx->block);
3254
3255    uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3256                         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3257                         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3258                         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3259    if (ctx->options->chip_class >= GFX10) {
3260       desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3261                    S_008F0C_OOB_SELECT(3) |
3262                    S_008F0C_RESOURCE_LEVEL(1);
3263    } else {
3264       desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3265                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3266    }
3267
3268    unsigned base = nir_intrinsic_base(instr);
3269    unsigned range = nir_intrinsic_range(instr);
3270
3271    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
3272    if (base && offset.type() == RegType::sgpr)
3273       offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
3274    else if (base && offset.type() == RegType::vgpr)
3275       offset = bld.vadd32(bld.def(v1), Operand(base), offset);
3276
3277    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3278                           bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(ctx->constant_data_offset)),
3279                           Operand(MIN2(base + range, ctx->shader->constant_data_size)),
3280                           Operand(desc_type));
3281
3282    load_buffer(ctx, instr->num_components, dst, rsrc, offset);
3283 }
3284
3285 void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
3286 {
3287    if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3288       ctx->cf_info.exec_potentially_empty = true;
3289
3290    ctx->program->needs_exact = true;
3291
3292    // TODO: optimize uniform conditions
3293    Builder bld(ctx->program, ctx->block);
3294    Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
3295    src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
3296    bld.pseudo(aco_opcode::p_discard_if, src);
3297    ctx->block->kind |= block_kind_uses_discard_if;
3298    return;
3299 }
3300
/* Instruction selection for an unconditional nir_intrinsic_discard.
 *
 * Three situations are distinguished:
 *  - inside a loop (when divergent, or when the discard is the last
 *    instruction of its block): the discard is handled like a break out of
 *    the loop;
 *  - not the last instruction of its block: it is turned into a
 *    p_discard_if on the current exec mask;
 *  - otherwise: the program either ends right here (no divergent parent if)
 *    or the block is only marked as a discard block and visit_if() adds the
 *    branch.
 */
void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
{
   Builder bld(ctx->program, ctx->block);

   if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
      ctx->cf_info.exec_potentially_empty = true;

   bool divergent = ctx->cf_info.parent_if.is_divergent ||
                    ctx->cf_info.parent_loop.has_divergent_continue;

   /* The inner condition is equivalent to "last instruction || divergent". */
   if (ctx->block->loop_nest_depth &&
       ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
      /* we handle discards the same way as jump instructions */
      append_logical_end(ctx->block);

      /* in loops, discard behaves like break */
      Block *linear_target = ctx->cf_info.parent_loop.exit;
      ctx->block->kind |= block_kind_discard;

      if (!divergent) {
         /* uniform discard - loop ends here */
         assert(nir_instr_is_last(&instr->instr));
         ctx->block->kind |= block_kind_uniform;
         ctx->cf_info.has_branch = true;
         bld.branch(aco_opcode::p_branch);
         add_linear_edge(ctx->block->index, linear_target);
         return;
      }

      /* we add a break right behind the discard() instructions */
      ctx->block->kind |= block_kind_break;
      unsigned idx = ctx->block->index;

      /* remove critical edges from linear CFG */
      bld.branch(aco_opcode::p_branch);
      /* break_block only forwards to the loop exit. */
      Block* break_block = ctx->program->create_and_insert_block();
      break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
      break_block->kind |= block_kind_uniform;
      add_linear_edge(idx, break_block);
      add_linear_edge(break_block->index, linear_target);
      bld.reset(break_block);
      bld.branch(aco_opcode::p_branch);

      /* Instruction selection continues in a fresh block after the discard. */
      Block* continue_block = ctx->program->create_and_insert_block();
      continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
      add_linear_edge(idx, continue_block);
      append_logical_start(continue_block);
      ctx->block = continue_block;

      return;
   }

   /* it can currently happen that NIR doesn't remove the unreachable code */
   if (!nir_instr_is_last(&instr->instr)) {
      ctx->program->needs_exact = true;
      /* save exec somewhere temporarily so that it doesn't get
       * overwritten before the discard from outer exec masks */
      Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, s2));
      bld.pseudo(aco_opcode::p_discard_if, cond);
      ctx->block->kind |= block_kind_uses_discard_if;
      return;
   }

   /* This condition is incorrect for uniformly branched discards in a loop
    * predicated by a divergent condition, but the above code catches that case
    * and the discard would end up turning into a discard_if.
    * For example:
    * if (divergent) {
    *    while (...) {
    *       if (uniform) {
    *          discard;
    *       }
    *    }
    * }
    */
   if (!ctx->cf_info.parent_if.is_divergent) {
      /* program just ends here */
      ctx->block->kind |= block_kind_uniform;
      /* Export with an empty enable mask and the done/valid-mask bits set,
       * then end the program. */
      bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
              0 /* enabled mask */, 9 /* dest */,
              false /* compressed */, true/* done */, true /* valid mask */);
      bld.sopp(aco_opcode::s_endpgm);
      // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
   } else {
      ctx->block->kind |= block_kind_discard;
      /* branch and linear edge is added by visit_if() */
   }
}
3389
/* Kind of descriptor requested from get_sampler_desc(). */
enum aco_descriptor_type {
   ACO_DESC_IMAGE,   /* loaded as 8 dwords (s_load_dwordx8) */
   ACO_DESC_FMASK,   /* 8 dwords, located 32 bytes after the binding's offset */
   ACO_DESC_SAMPLER, /* 4 dwords; combined image+samplers add a sampler offset */
   ACO_DESC_BUFFER,  /* loaded as 4 dwords (s_load_dwordx4) */
   ACO_DESC_PLANE_0, /* NOTE(review): presumably per-plane image descriptors - confirm */
   ACO_DESC_PLANE_1,
   ACO_DESC_PLANE_2,
};
3399
3400 static bool
3401 should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
3402    if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
3403       return false;
3404    ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array);
3405    return dim == ac_image_cube ||
3406           dim == ac_image_1darray ||
3407           dim == ac_image_2darray ||
3408           dim == ac_image_2darraymsaa;
3409 }
3410
3411 Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
3412                       enum aco_descriptor_type desc_type,
3413                       const nir_tex_instr *tex_instr, bool image, bool write)
3414 {
3415 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
3416    std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
3417    if (it != ctx->tex_desc.end())
3418       return it->second;
3419 */
3420    Temp index = Temp();
3421    bool index_set = false;
3422    unsigned constant_index = 0;
3423    unsigned descriptor_set;
3424    unsigned base_index;
3425    Builder bld(ctx->program, ctx->block);
3426
3427    if (!deref_instr) {
3428       assert(tex_instr && !image);
3429       descriptor_set = 0;
3430       base_index = tex_instr->sampler_index;
3431    } else {
3432       while(deref_instr->deref_type != nir_deref_type_var) {
3433          unsigned array_size = glsl_get_aoa_size(deref_instr->type);
3434          if (!array_size)
3435             array_size = 1;
3436
3437          assert(deref_instr->deref_type == nir_deref_type_array);
3438          nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
3439          if (const_value) {
3440             constant_index += array_size * const_value->u32;
3441          } else {
3442             Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa);
3443             if (indirect.type() == RegType::vgpr)
3444                indirect = bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), indirect);
3445
3446             if (array_size != 1)
3447                indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
3448
3449             if (!index_set) {
3450                index = indirect;
3451                index_set = true;
3452             } else {
3453                index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
3454             }
3455          }
3456
3457          deref_instr = nir_src_as_deref(deref_instr->parent);
3458       }
3459       descriptor_set = deref_instr->var->data.descriptor_set;
3460       base_index = deref_instr->var->data.binding;
3461    }
3462
3463    Temp list = load_desc_ptr(ctx, descriptor_set);
3464    list = convert_pointer_to_64_bit(ctx, list);
3465
3466    struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
3467    struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
3468    unsigned offset = binding->offset;
3469    unsigned stride = binding->size;
3470    aco_opcode opcode;
3471    RegClass type;
3472
3473    assert(base_index < layout->binding_count);
3474
3475    switch (desc_type) {
3476    case ACO_DESC_IMAGE:
3477       type = s8;
3478       opcode = aco_opcode::s_load_dwordx8;
3479       break;
3480    case ACO_DESC_FMASK:
3481       type = s8;
3482       opcode = aco_opcode::s_load_dwordx8;
3483       offset += 32;
3484       break;
3485    case ACO_DESC_SAMPLER:
3486       type = s4;
3487       opcode = aco_opcode::s_load_dwordx4;
3488       if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
3489          offset += radv_combined_image_descriptor_sampler_offset(binding);
3490       break;
3491    case ACO_DESC_BUFFER:
3492       type = s4;
3493       opcode = aco_opcode::s_load_dwordx4;
3494       break;
3495    case ACO_DESC_PLANE_0:
3496    case ACO_DESC_PLANE_1:
3497       type = s8;
3498       opcode = aco_opcode::s_load_dwordx8;
3499       offset += 32 * (desc_type - ACO_DESC_PLANE_0);
3500       break;
3501    case ACO_DESC_PLANE_2:
3502       type = s4;
3503       opcode = aco_opcode::s_load_dwordx4;
3504       offset += 64;
3505       break;
3506    default:
3507       unreachable("invalid desc_type\n");
3508    }
3509
3510    offset += constant_index * stride;
3511
3512    if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
3513       (!index_set || binding->immutable_samplers_equal)) {
3514       if (binding->immutable_samplers_equal)
3515          constant_index = 0;
3516
3517       const uint32_t *samplers = radv_immutable_samplers(layout, binding);
3518       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3519                         Operand(samplers[constant_index * 4 + 0]),
3520                         Operand(samplers[constant_index * 4 + 1]),
3521                         Operand(samplers[constant_index * 4 + 2]),
3522                         Operand(samplers[constant_index * 4 + 3]));
3523    }
3524
3525    Operand off;
3526    if (!index_set) {
3527       off = Operand(offset);
3528    } else {
3529       off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
3530                                    bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
3531    }
3532
3533    Temp res = bld.smem(opcode, bld.def(type), list, off);
3534
3535    if (desc_type == ACO_DESC_PLANE_2) {
3536       Temp components[8];
3537       for (unsigned i = 0; i < 8; i++)
3538          components[i] = bld.tmp(s1);
3539       bld.pseudo(aco_opcode::p_split_vector,
3540                  Definition(components[0]),
3541                  Definition(components[1]),
3542                  Definition(components[2]),
3543                  Definition(components[3]),
3544                  res);
3545
3546       Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
3547       bld.pseudo(aco_opcode::p_split_vector,
3548                  bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
3549                  Definition(components[4]),
3550                  Definition(components[5]),
3551                  Definition(components[6]),
3552                  Definition(components[7]),
3553                  desc2);
3554
3555       res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
3556                        components[0], components[1], components[2], components[3],
3557                        components[4], components[5], components[6], components[7]);
3558    }
3559
3560    return res;
3561 }
3562
3563 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
3564 {
3565    switch (dim) {
3566    case GLSL_SAMPLER_DIM_BUF:
3567       return 1;
3568    case GLSL_SAMPLER_DIM_1D:
3569       return array ? 2 : 1;
3570    case GLSL_SAMPLER_DIM_2D:
3571       return array ? 3 : 2;
3572    case GLSL_SAMPLER_DIM_MS:
3573       return array ? 4 : 3;
3574    case GLSL_SAMPLER_DIM_3D:
3575    case GLSL_SAMPLER_DIM_CUBE:
3576       return 3;
3577    case GLSL_SAMPLER_DIM_RECT:
3578    case GLSL_SAMPLER_DIM_SUBPASS:
3579       return 2;
3580    case GLSL_SAMPLER_DIM_SUBPASS_MS:
3581       return 3;
3582    default:
3583       break;
3584    }
3585    return 0;
3586 }
3587
3588
3589 /* Adjust the sample index according to FMASK.
3590  *
3591  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3592  * which is the identity mapping. Each nibble says which physical sample
3593  * should be fetched to get that sample.
3594  *
3595  * For example, 0x11111100 means there are only 2 samples stored and
3596  * the second sample covers 3/4 of the pixel. When reading samples 0
3597  * and 1, return physical sample 0 (determined by the first two 0s
3598  * in FMASK), otherwise return physical sample 1.
3599  *
3600  * The sample index should be adjusted as follows:
3601  *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
3602  */
3603 static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coords, Operand sample_index, Temp fmask_desc_ptr)
3604 {
3605    Builder bld(ctx->program, ctx->block);
3606    Temp fmask = bld.tmp(v1);
3607    unsigned dim = ctx->options->chip_class >= GFX10
3608                   ? ac_get_sampler_dim(ctx->options->chip_class, GLSL_SAMPLER_DIM_2D, da)
3609                   : 0;
3610
3611    aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
3612    load->operands[0] = Operand(coords);
3613    load->operands[1] = Operand(fmask_desc_ptr);
3614    load->definitions[0] = Definition(fmask);
3615    load->glc = false;
3616    load->dlc = false;
3617    load->dmask = 0x1;
3618    load->unrm = true;
3619    load->da = da;
3620    load->dim = dim;
3621    load->can_reorder = true; /* fmask images shouldn't be modified */
3622    ctx->block->instructions.emplace_back(std::move(load));
3623
3624    Operand sample_index4;
3625    if (sample_index.isConstant() && sample_index.constantValue() < 16) {
3626       sample_index4 = Operand(sample_index.constantValue() << 2);
3627    } else if (sample_index.regClass() == s1) {
3628       sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
3629    } else {
3630       assert(sample_index.regClass() == v1);
3631       sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
3632    }
3633
3634    Temp final_sample;
3635    if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
3636       final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
3637    else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
3638       final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
3639    else
3640       final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
3641
3642    /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3643     * resource descriptor is 0 (invalid),
3644     */
3645    Temp compare = bld.tmp(s2);
3646    bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
3647                 Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
3648
3649    Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
3650
3651    /* Replace the MSAA sample index. */
3652    return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
3653 }
3654
3655 static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
3656 {
3657
3658    Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
3659    enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3660    bool is_array = glsl_sampler_type_is_array(type);
3661    ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3662    assert(!add_frag_pos && "Input attachments should be lowered.");
3663    bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3664    bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D;
3665    int count = image_type_to_components_count(dim, is_array);
3666    std::vector<Operand> coords(count);
3667
3668    if (is_ms) {
3669       Operand sample_index;
3670       nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
3671       if (sample_cv)
3672          sample_index = Operand(sample_cv->u32);
3673       else
3674          sample_index = Operand(emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1));
3675
3676       if (instr->intrinsic == nir_intrinsic_image_deref_load) {
3677          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, is_array ? 3 : 2, 1)};
3678          for (unsigned i = 0; i < vec->operands.size(); i++)
3679             vec->operands[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3680          Temp fmask_load_address = {ctx->program->allocateId(), is_array ? v3 : v2};
3681          vec->definitions[0] = Definition(fmask_load_address);
3682          ctx->block->instructions.emplace_back(std::move(vec));
3683
3684          Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
3685          sample_index = Operand(adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr));
3686       }
3687       count--;
3688       coords[count] = sample_index;
3689    }
3690
3691    if (count == 1 && !gfx9_1d)
3692       return emit_extract_vector(ctx, src0, 0, v1);
3693
3694    if (gfx9_1d) {
3695       coords[0] = Operand(emit_extract_vector(ctx, src0, 0, v1));
3696       coords.resize(coords.size() + 1);
3697       coords[1] = Operand((uint32_t) 0);
3698       if (is_array)
3699          coords[2] = Operand(emit_extract_vector(ctx, src0, 1, v1));
3700    } else {
3701       for (int i = 0; i < count; i++)
3702          coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3703    }
3704
3705    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
3706    for (unsigned i = 0; i < coords.size(); i++)
3707       vec->operands[i] = coords[i];
3708    Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
3709    vec->definitions[0] = Definition(res);
3710    ctx->block->instructions.emplace_back(std::move(vec));
3711    return res;
3712 }
3713
3714
3715 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
3716 {
3717    Builder bld(ctx->program, ctx->block);
3718    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
3719    const struct glsl_type *type = glsl_without_array(var->type);
3720    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3721    bool is_array = glsl_sampler_type_is_array(type);
3722    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3723
3724    if (dim == GLSL_SAMPLER_DIM_BUF) {
3725       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
3726       unsigned num_channels = util_last_bit(mask);
3727       Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
3728       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
3729
3730       aco_opcode opcode;
3731       switch (num_channels) {
3732       case 1:
3733          opcode = aco_opcode::buffer_load_format_x;
3734          break;
3735       case 2:
3736          opcode = aco_opcode::buffer_load_format_xy;
3737          break;
3738       case 3:
3739          opcode = aco_opcode::buffer_load_format_xyz;
3740          break;
3741       case 4:
3742          opcode = aco_opcode::buffer_load_format_xyzw;
3743          break;
3744       default:
3745          unreachable(">4 channel buffer image load");
3746       }
3747       aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
3748       load->operands[0] = Operand(vindex);
3749       load->operands[1] = Operand(rsrc);
3750       load->operands[2] = Operand((uint32_t) 0);
3751       Temp tmp;
3752       if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
3753          tmp = dst;
3754       else
3755          tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
3756       load->definitions[0] = Definition(tmp);
3757       load->idxen = true;
3758       load->barrier = barrier_image;
3759       ctx->block->instructions.emplace_back(std::move(load));
3760
3761       expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
3762       return;
3763    }
3764
3765    Temp coords = get_image_coords(ctx, instr, type);
3766    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
3767
3768    unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
3769    unsigned num_components = util_bitcount(dmask);
3770    Temp tmp;
3771    if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
3772       tmp = dst;
3773    else
3774       tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
3775
3776    aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
3777    load->operands[0] = Operand(coords);
3778    load->operands[1] = Operand(resource);
3779    load->definitions[0] = Definition(tmp);
3780    load->glc = var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
3781    load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
3782    load->dmask = dmask;
3783    load->unrm = true;
3784    load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
3785    load->barrier = barrier_image;
3786    ctx->block->instructions.emplace_back(std::move(load));
3787
3788    expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
3789    return;
3790 }
3791
3792 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
3793 {
3794    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
3795    const struct glsl_type *type = glsl_without_array(var->type);
3796    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3797    bool is_array = glsl_sampler_type_is_array(type);
3798    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
3799
3800    bool glc = ctx->options->chip_class == GFX6 || var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
3801
3802    if (dim == GLSL_SAMPLER_DIM_BUF) {
3803       Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
3804       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
3805       aco_opcode opcode;
3806       switch (data.size()) {
3807       case 1:
3808          opcode = aco_opcode::buffer_store_format_x;
3809          break;
3810       case 2:
3811          opcode = aco_opcode::buffer_store_format_xy;
3812          break;
3813       case 3:
3814          opcode = aco_opcode::buffer_store_format_xyz;
3815          break;
3816       case 4:
3817          opcode = aco_opcode::buffer_store_format_xyzw;
3818          break;
3819       default:
3820          unreachable(">4 channel buffer image store");
3821       }
3822       aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
3823       store->operands[0] = Operand(vindex);
3824       store->operands[1] = Operand(rsrc);
3825       store->operands[2] = Operand((uint32_t) 0);
3826       store->operands[3] = Operand(data);
3827       store->idxen = true;
3828       store->glc = glc;
3829       store->dlc = false;
3830       store->disable_wqm = true;
3831       store->barrier = barrier_image;
3832       ctx->program->needs_exact = true;
3833       ctx->block->instructions.emplace_back(std::move(store));
3834       return;
3835    }
3836
3837    assert(data.type() == RegType::vgpr);
3838    Temp coords = get_image_coords(ctx, instr, type);
3839    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
3840
3841    aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(aco_opcode::image_store, Format::MIMG, 4, 0)};
3842    store->operands[0] = Operand(coords);
3843    store->operands[1] = Operand(resource);
3844    store->operands[2] = Operand(s4);
3845    store->operands[3] = Operand(data);
3846    store->glc = glc;
3847    store->dlc = false;
3848    store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
3849    store->dmask = (1 << data.size()) - 1;
3850    store->unrm = true;
3851    store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
3852    store->disable_wqm = true;
3853    store->barrier = barrier_image;
3854    ctx->program->needs_exact = true;
3855    ctx->block->instructions.emplace_back(std::move(store));
3856    return;
3857 }
3858
3859 void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
3860 {
3861    /* return the previous value if dest is ever used */
3862    bool return_previous = false;
3863    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
3864       return_previous = true;
3865       break;
3866    }
3867    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
3868       return_previous = true;
3869       break;
3870    }
3871
3872    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
3873    const struct glsl_type *type = glsl_without_array(var->type);
3874    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3875    bool is_array = glsl_sampler_type_is_array(type);
3876    Builder bld(ctx->program, ctx->block);
3877
3878    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
3879    assert(data.size() == 1 && "64bit ssbo atomics not yet implemented.");
3880
3881    if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
3882       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
3883
3884    aco_opcode buf_op, image_op;
3885    switch (instr->intrinsic) {
3886       case nir_intrinsic_image_deref_atomic_add:
3887          buf_op = aco_opcode::buffer_atomic_add;
3888          image_op = aco_opcode::image_atomic_add;
3889          break;
3890       case nir_intrinsic_image_deref_atomic_umin:
3891          buf_op = aco_opcode::buffer_atomic_umin;
3892          image_op = aco_opcode::image_atomic_umin;
3893          break;
3894       case nir_intrinsic_image_deref_atomic_imin:
3895          buf_op = aco_opcode::buffer_atomic_smin;
3896          image_op = aco_opcode::image_atomic_smin;
3897          break;
3898       case nir_intrinsic_image_deref_atomic_umax:
3899          buf_op = aco_opcode::buffer_atomic_umax;
3900          image_op = aco_opcode::image_atomic_umax;
3901          break;
3902       case nir_intrinsic_image_deref_atomic_imax:
3903          buf_op = aco_opcode::buffer_atomic_smax;
3904          image_op = aco_opcode::image_atomic_smax;
3905          break;
3906       case nir_intrinsic_image_deref_atomic_and:
3907          buf_op = aco_opcode::buffer_atomic_and;
3908          image_op = aco_opcode::image_atomic_and;
3909          break;
3910       case nir_intrinsic_image_deref_atomic_or:
3911          buf_op = aco_opcode::buffer_atomic_or;
3912          image_op = aco_opcode::image_atomic_or;
3913          break;
3914       case nir_intrinsic_image_deref_atomic_xor:
3915          buf_op = aco_opcode::buffer_atomic_xor;
3916          image_op = aco_opcode::image_atomic_xor;
3917          break;
3918       case nir_intrinsic_image_deref_atomic_exchange:
3919          buf_op = aco_opcode::buffer_atomic_swap;
3920          image_op = aco_opcode::image_atomic_swap;
3921          break;
3922       case nir_intrinsic_image_deref_atomic_comp_swap:
3923          buf_op = aco_opcode::buffer_atomic_cmpswap;
3924          image_op = aco_opcode::image_atomic_cmpswap;
3925          break;
3926       default:
3927          unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
3928    }
3929
3930    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3931
3932    if (dim == GLSL_SAMPLER_DIM_BUF) {
3933       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
3934       Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
3935       //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
3936       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
3937       mubuf->operands[0] = Operand(vindex);
3938       mubuf->operands[1] = Operand(resource);
3939       mubuf->operands[2] = Operand((uint32_t)0);
3940       mubuf->operands[3] = Operand(data);
3941       if (return_previous)
3942          mubuf->definitions[0] = Definition(dst);
3943       mubuf->offset = 0;
3944       mubuf->idxen = true;
3945       mubuf->glc = return_previous;
3946       mubuf->dlc = false; /* Not needed for atomics */
3947       mubuf->disable_wqm = true;
3948       mubuf->barrier = barrier_image;
3949       ctx->program->needs_exact = true;
3950       ctx->block->instructions.emplace_back(std::move(mubuf));
3951       return;
3952    }
3953
3954    Temp coords = get_image_coords(ctx, instr, type);
3955    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
3956    aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 4, return_previous ? 1 : 0)};
3957    mimg->operands[0] = Operand(coords);
3958    mimg->operands[1] = Operand(resource);
3959    mimg->operands[2] = Operand(s4); /* no sampler */
3960    mimg->operands[3] = Operand(data);
3961    if (return_previous)
3962       mimg->definitions[0] = Definition(dst);
3963    mimg->glc = return_previous;
3964    mimg->dlc = false; /* Not needed for atomics */
3965    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
3966    mimg->dmask = (1 << data.size()) - 1;
3967    mimg->unrm = true;
3968    mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
3969    mimg->disable_wqm = true;
3970    mimg->barrier = barrier_image;
3971    ctx->program->needs_exact = true;
3972    ctx->block->instructions.emplace_back(std::move(mimg));
3973    return;
3974 }
3975
3976 void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
3977 {
3978    if (in_elements && ctx->options->chip_class == GFX8) {
3979       Builder bld(ctx->program, ctx->block);
3980
3981       Temp stride = emit_extract_vector(ctx, desc, 1, s1);
3982       stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
3983       stride = bld.vop1(aco_opcode::v_cvt_f32_ubyte0, bld.def(v1), stride);
3984       stride = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), stride);
3985
3986       Temp size = emit_extract_vector(ctx, desc, 2, s1);
3987       size = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), size);
3988
3989       Temp res = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), size, stride);
3990       res = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), res);
3991       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
3992
3993       // TODO: we can probably calculate this faster on the scalar unit to do: size / stride{1,2,4,8,12,16}
3994       /* idea
3995        * for 1,2,4,8,16, the result is just (stride >> S_FF1_I32_B32)
3996        * in case 12 (or 3?), we have to divide by 3:
3997        * set v_skip in case it's 12 (if we also have to take care of 3, shift first)
3998        * use v_mul_hi_u32 with magic number to divide
3999        * we need some pseudo merge opcode to overwrite the original SALU result with readfirstlane
4000        * disable v_skip
4001        * total: 6 SALU + 2 VALU instructions vs 1 SALU + 6 VALU instructions
4002        */
4003
4004    } else {
4005       emit_extract_vector(ctx, desc, 2, dst);
4006    }
4007 }
4008
4009 void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
4010 {
4011    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4012    const struct glsl_type *type = glsl_without_array(var->type);
4013    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
4014    bool is_array = glsl_sampler_type_is_array(type);
4015    Builder bld(ctx->program, ctx->block);
4016
4017    if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
4018       Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
4019       return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
4020    }
4021
4022    /* LOD */
4023    Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
4024
4025    /* Resource */
4026    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
4027
4028    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4029
4030    aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)};
4031    mimg->operands[0] = Operand(lod);
4032    mimg->operands[1] = Operand(resource);
4033    unsigned& dmask = mimg->dmask;
4034    mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
4035    mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
4036    mimg->da = glsl_sampler_type_is_array(type);
4037    mimg->can_reorder = true;
4038    Definition& def = mimg->definitions[0];
4039    ctx->block->instructions.emplace_back(std::move(mimg));
4040
4041    if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
4042        glsl_sampler_type_is_array(type)) {
4043
4044       assert(instr->dest.ssa.num_components == 3);
4045       Temp tmp = {ctx->program->allocateId(), v3};
4046       def = Definition(tmp);
4047       emit_split_vector(ctx, tmp, 3);
4048
4049       /* divide 3rd value by 6 by multiplying with magic number */
4050       Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
4051       Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
4052
4053       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4054                  emit_extract_vector(ctx, tmp, 0, v1),
4055                  emit_extract_vector(ctx, tmp, 1, v1),
4056                  by_6);
4057
4058    } else if (ctx->options->chip_class == GFX9 &&
4059               glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
4060               glsl_sampler_type_is_array(type)) {
4061       assert(instr->dest.ssa.num_components == 2);
4062       def = Definition(dst);
4063       dmask = 0x5;
4064    } else {
4065       def = Definition(dst);
4066    }
4067
4068    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
4069 }
4070
4071 void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4072 {
4073    Builder bld(ctx->program, ctx->block);
4074    unsigned num_components = instr->num_components;
4075
4076    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4077    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4078    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4079
4080    bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4081    load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc);
4082 }
4083
4084 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4085 {
4086    Builder bld(ctx->program, ctx->block);
4087    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4088    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4089    unsigned writemask = nir_intrinsic_write_mask(instr);
4090
4091    Temp offset;
4092    if (ctx->options->chip_class < GFX8)
4093       offset = as_vgpr(ctx,get_ssa_temp(ctx, instr->src[2].ssa));
4094    else
4095       offset = get_ssa_temp(ctx, instr->src[2].ssa);
4096
4097    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4098    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4099
4100    bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
4101                ctx->options->chip_class >= GFX8;
4102    if (smem)
4103       offset = bld.as_uniform(offset);
4104    bool smem_nonfs = smem && ctx->stage != fragment_fs;
4105
4106    while (writemask) {
4107       int start, count;
4108       u_bit_scan_consecutive_range(&writemask, &start, &count);
4109       if (count == 3 && smem) {
4110          writemask |= 1u << (start + 2);
4111          count = 2;
4112       }
4113       int num_bytes = count * elem_size_bytes;
4114
4115       if (num_bytes > 16) {
4116          assert(elem_size_bytes == 8);
4117          writemask |= (((count - 2) << 1) - 1) << (start + 2);
4118          count = 2;
4119          num_bytes = 16;
4120       }
4121
4122       // TODO: check alignment of sub-dword stores
4123       // TODO: split 3 bytes. there is no store instruction for that
4124
4125       Temp write_data;
4126       if (count != instr->num_components) {
4127          emit_split_vector(ctx, data, instr->num_components);
4128          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4129          for (int i = 0; i < count; i++) {
4130             Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
4131             vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
4132          }
4133          write_data = bld.tmp(smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
4134          vec->definitions[0] = Definition(write_data);
4135          ctx->block->instructions.emplace_back(std::move(vec));
4136       } else if (!smem && data.type() != RegType::vgpr) {
4137          assert(num_bytes % 4 == 0);
4138          write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
4139       } else if (smem_nonfs && data.type() == RegType::vgpr) {
4140          assert(num_bytes % 4 == 0);
4141          write_data = bld.as_uniform(data);
4142       } else {
4143          write_data = data;
4144       }
4145
4146       aco_opcode vmem_op, smem_op;
4147       switch (num_bytes) {
4148          case 4:
4149             vmem_op = aco_opcode::buffer_store_dword;
4150             smem_op = aco_opcode::s_buffer_store_dword;
4151             break;
4152          case 8:
4153             vmem_op = aco_opcode::buffer_store_dwordx2;
4154             smem_op = aco_opcode::s_buffer_store_dwordx2;
4155             break;
4156          case 12:
4157             vmem_op = aco_opcode::buffer_store_dwordx3;
4158             smem_op = aco_opcode::last_opcode;
4159             assert(!smem);
4160             break;
4161          case 16:
4162             vmem_op = aco_opcode::buffer_store_dwordx4;
4163             smem_op = aco_opcode::s_buffer_store_dwordx4;
4164             break;
4165          default:
4166             unreachable("Store SSBO not implemented for this size.");
4167       }
4168       if (ctx->stage == fragment_fs)
4169          smem_op = aco_opcode::p_fs_buffer_store_smem;
4170
4171       if (smem) {
4172          aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
4173          store->operands[0] = Operand(rsrc);
4174          if (start) {
4175             Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
4176                                 offset, Operand(start * elem_size_bytes));
4177             store->operands[1] = Operand(off);
4178          } else {
4179             store->operands[1] = Operand(offset);
4180          }
4181          if (smem_op != aco_opcode::p_fs_buffer_store_smem)
4182             store->operands[1].setFixed(m0);
4183          store->operands[2] = Operand(write_data);
4184          store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4185          store->dlc = false;
4186          store->disable_wqm = true;
4187          store->barrier = barrier_buffer;
4188          ctx->block->instructions.emplace_back(std::move(store));
4189          ctx->program->wb_smem_l1_on_end = true;
4190          if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
4191             ctx->block->kind |= block_kind_needs_lowering;
4192             ctx->program->needs_exact = true;
4193          }
4194       } else {
4195          aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
4196          store->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4197          store->operands[1] = Operand(rsrc);
4198          store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4199          store->operands[3] = Operand(write_data);
4200          store->offset = start * elem_size_bytes;
4201          store->offen = (offset.type() == RegType::vgpr);
4202          store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4203          store->dlc = false;
4204          store->disable_wqm = true;
4205          store->barrier = barrier_buffer;
4206          ctx->program->needs_exact = true;
4207          ctx->block->instructions.emplace_back(std::move(store));
4208       }
4209    }
4210 }
4211
4212 void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4213 {
4214    /* return the previous value if dest is ever used */
4215    bool return_previous = false;
4216    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4217       return_previous = true;
4218       break;
4219    }
4220    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4221       return_previous = true;
4222       break;
4223    }
4224
4225    Builder bld(ctx->program, ctx->block);
4226    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
4227
4228    if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
4229       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
4230                         get_ssa_temp(ctx, instr->src[3].ssa), data);
4231
4232    Temp offset;
4233    if (ctx->options->chip_class < GFX8)
4234       offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4235    else
4236       offset = get_ssa_temp(ctx, instr->src[1].ssa);
4237
4238    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4239    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4240
4241    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4242
4243    aco_opcode op32, op64;
4244    switch (instr->intrinsic) {
4245       case nir_intrinsic_ssbo_atomic_add:
4246          op32 = aco_opcode::buffer_atomic_add;
4247          op64 = aco_opcode::buffer_atomic_add_x2;
4248          break;
4249       case nir_intrinsic_ssbo_atomic_imin:
4250          op32 = aco_opcode::buffer_atomic_smin;
4251          op64 = aco_opcode::buffer_atomic_smin_x2;
4252          break;
4253       case nir_intrinsic_ssbo_atomic_umin:
4254          op32 = aco_opcode::buffer_atomic_umin;
4255          op64 = aco_opcode::buffer_atomic_umin_x2;
4256          break;
4257       case nir_intrinsic_ssbo_atomic_imax:
4258          op32 = aco_opcode::buffer_atomic_smax;
4259          op64 = aco_opcode::buffer_atomic_smax_x2;
4260          break;
4261       case nir_intrinsic_ssbo_atomic_umax:
4262          op32 = aco_opcode::buffer_atomic_umax;
4263          op64 = aco_opcode::buffer_atomic_umax_x2;
4264          break;
4265       case nir_intrinsic_ssbo_atomic_and:
4266          op32 = aco_opcode::buffer_atomic_and;
4267          op64 = aco_opcode::buffer_atomic_and_x2;
4268          break;
4269       case nir_intrinsic_ssbo_atomic_or:
4270          op32 = aco_opcode::buffer_atomic_or;
4271          op64 = aco_opcode::buffer_atomic_or_x2;
4272          break;
4273       case nir_intrinsic_ssbo_atomic_xor:
4274          op32 = aco_opcode::buffer_atomic_xor;
4275          op64 = aco_opcode::buffer_atomic_xor_x2;
4276          break;
4277       case nir_intrinsic_ssbo_atomic_exchange:
4278          op32 = aco_opcode::buffer_atomic_swap;
4279          op64 = aco_opcode::buffer_atomic_swap_x2;
4280          break;
4281       case nir_intrinsic_ssbo_atomic_comp_swap:
4282          op32 = aco_opcode::buffer_atomic_cmpswap;
4283          op64 = aco_opcode::buffer_atomic_cmpswap_x2;
4284          break;
4285       default:
4286          unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
4287    }
4288    aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
4289    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
4290    mubuf->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4291    mubuf->operands[1] = Operand(rsrc);
4292    mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4293    mubuf->operands[3] = Operand(data);
4294    if (return_previous)
4295       mubuf->definitions[0] = Definition(dst);
4296    mubuf->offset = 0;
4297    mubuf->offen = (offset.type() == RegType::vgpr);
4298    mubuf->glc = return_previous;
4299    mubuf->dlc = false; /* Not needed for atomics */
4300    mubuf->disable_wqm = true;
4301    mubuf->barrier = barrier_buffer;
4302    ctx->program->needs_exact = true;
4303    ctx->block->instructions.emplace_back(std::move(mubuf));
4304 }
4305
4306 void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
4307
4308    Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4309    Builder bld(ctx->program, ctx->block);
4310    Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
4311    get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
4312 }
4313
4314 void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
4315 {
4316    Builder bld(ctx->program, ctx->block);
4317    unsigned num_components = instr->num_components;
4318    unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8;
4319
4320    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4321    Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
4322
4323    bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4324    bool dlc = glc && ctx->options->chip_class >= GFX10;
4325    aco_opcode op;
4326    if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
4327       bool global = ctx->options->chip_class >= GFX9;
4328       aco_opcode op;
4329       switch (num_bytes) {
4330       case 4:
4331          op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
4332          break;
4333       case 8:
4334          op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
4335          break;
4336       case 12:
4337          op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4338          break;
4339       case 16:
4340          op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
4341          break;
4342       default:
4343          unreachable("load_global not implemented for this size.");
4344       }
4345       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4346       flat->operands[0] = Operand(addr);
4347       flat->operands[1] = Operand(s1);
4348       flat->glc = glc;
4349       flat->dlc = dlc;
4350
4351       if (dst.type() == RegType::sgpr) {
4352          Temp vec = bld.tmp(RegType::vgpr, dst.size());
4353          flat->definitions[0] = Definition(vec);
4354          ctx->block->instructions.emplace_back(std::move(flat));
4355          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
4356       } else {
4357          flat->definitions[0] = Definition(dst);
4358          ctx->block->instructions.emplace_back(std::move(flat));
4359       }
4360       emit_split_vector(ctx, dst, num_components);
4361    } else {
4362       switch (num_bytes) {
4363          case 4:
4364             op = aco_opcode::s_load_dword;
4365             break;
4366          case 8:
4367             op = aco_opcode::s_load_dwordx2;
4368             break;
4369          case 12:
4370          case 16:
4371             op = aco_opcode::s_load_dwordx4;
4372             break;
4373          default:
4374             unreachable("load_global not implemented for this size.");
4375       }
4376       aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4377       load->operands[0] = Operand(addr);
4378       load->operands[1] = Operand(0u);
4379       load->definitions[0] = Definition(dst);
4380       load->glc = glc;
4381       load->dlc = dlc;
4382       load->barrier = barrier_buffer;
4383       assert(ctx->options->chip_class >= GFX8 || !glc);
4384
4385       if (dst.size() == 3) {
4386          /* trim vector */
4387          Temp vec = bld.tmp(s4);
4388          load->definitions[0] = Definition(vec);
4389          ctx->block->instructions.emplace_back(std::move(load));
4390          emit_split_vector(ctx, vec, 4);
4391
4392          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4393                     emit_extract_vector(ctx, vec, 0, s1),
4394                     emit_extract_vector(ctx, vec, 1, s1),
4395                     emit_extract_vector(ctx, vec, 2, s1));
4396       } else {
4397          ctx->block->instructions.emplace_back(std::move(load));
4398       }
4399    }
4400 }
4401
4402 void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
4403 {
4404    Builder bld(ctx->program, ctx->block);
4405    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4406
4407    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4408    Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4409
4410    unsigned writemask = nir_intrinsic_write_mask(instr);
4411    while (writemask) {
4412       int start, count;
4413       u_bit_scan_consecutive_range(&writemask, &start, &count);
4414       unsigned num_bytes = count * elem_size_bytes;
4415
4416       Temp write_data = data;
4417       if (count != instr->num_components) {
4418          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4419          for (int i = 0; i < count; i++)
4420             vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
4421          write_data = bld.tmp(RegType::vgpr, count);
4422          vec->definitions[0] = Definition(write_data);
4423          ctx->block->instructions.emplace_back(std::move(vec));
4424       }
4425
4426       unsigned offset = start * elem_size_bytes;
4427       if (offset > 0 && ctx->options->chip_class < GFX9) {
4428          Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
4429          Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
4430          Temp carry = bld.tmp(s2);
4431          bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
4432
4433          bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
4434                   Operand(offset), addr0);
4435          bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(s2),
4436                   Operand(0u), addr1,
4437                   carry).def(1).setHint(vcc);
4438
4439          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
4440
4441          offset = 0;
4442       }
4443
4444       bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4445       bool global = ctx->options->chip_class >= GFX9;
4446       aco_opcode op;
4447       switch (num_bytes) {
4448       case 4:
4449          op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
4450          break;
4451       case 8:
4452          op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
4453          break;
4454       case 12:
4455          op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
4456          break;
4457       case 16:
4458          op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
4459          break;
4460       default:
4461          unreachable("store_global not implemented for this size.");
4462       }
4463       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
4464       flat->operands[0] = Operand(addr);
4465       flat->operands[1] = Operand(s1);
4466       flat->operands[2] = Operand(data);
4467       flat->glc = glc;
4468       flat->dlc = false;
4469       flat->offset = offset;
4470       ctx->block->instructions.emplace_back(std::move(flat));
4471    }
4472 }
4473
4474 void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
4475    Builder bld(ctx->program, ctx->block);
4476    switch(instr->intrinsic) {
4477       case nir_intrinsic_group_memory_barrier:
4478       case nir_intrinsic_memory_barrier:
4479          bld.barrier(aco_opcode::p_memory_barrier_all);
4480          break;
4481       case nir_intrinsic_memory_barrier_atomic_counter:
4482          bld.barrier(aco_opcode::p_memory_barrier_atomic);
4483          break;
4484       case nir_intrinsic_memory_barrier_buffer:
4485          bld.barrier(aco_opcode::p_memory_barrier_buffer);
4486          break;
4487       case nir_intrinsic_memory_barrier_image:
4488          bld.barrier(aco_opcode::p_memory_barrier_image);
4489          break;
4490       case nir_intrinsic_memory_barrier_shared:
4491          bld.barrier(aco_opcode::p_memory_barrier_shared);
4492          break;
4493       default:
4494          unreachable("Unimplemented memory barrier intrinsic");
4495          break;
4496    }
4497 }
4498
4499 Operand load_lds_size_m0(isel_context *ctx)
4500 {
4501    /* TODO: m0 does not need to be initialized on GFX9+ */
4502    Builder bld(ctx->program, ctx->block);
4503    return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
4504 }
4505
4506
4507 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4508 {
4509    // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
4510    Operand m = load_lds_size_m0(ctx);
4511    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4512    assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
4513    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4514    Builder bld(ctx->program, ctx->block);
4515
4516    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
4517    unsigned bytes_read = 0;
4518    unsigned result_size = 0;
4519    unsigned total_bytes = instr->num_components * elem_size_bytes;
4520    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : instr->dest.ssa.bit_size / 8;
4521    std::array<Temp, 4> result;
4522
4523    while (bytes_read < total_bytes) {
4524       unsigned todo = total_bytes - bytes_read;
4525       bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0;
4526       bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0;
4527
4528       aco_opcode op = aco_opcode::last_opcode;
4529       if (todo >= 16 && aligned16) {
4530          op = aco_opcode::ds_read_b128;
4531          todo = 16;
4532       } else if (todo >= 12 && aligned16) {
4533          op = aco_opcode::ds_read_b96;
4534          todo = 12;
4535       } else if (todo >= 8) {
4536          op = aligned8 ? aco_opcode::ds_read_b64 : aco_opcode::ds_read2_b32;
4537          todo = 8;
4538       } else if (todo >= 4) {
4539          op = aco_opcode::ds_read_b32;
4540          todo = 4;
4541       } else {
4542          assert(false);
4543       }
4544       assert(todo % elem_size_bytes == 0);
4545       unsigned num_elements = todo / elem_size_bytes;
4546       unsigned offset = nir_intrinsic_base(instr) + bytes_read;
4547       unsigned max_offset = op == aco_opcode::ds_read2_b32 ? 1019 : 65535;
4548
4549       Temp address_offset = address;
4550       if (offset > max_offset) {
4551          address_offset = bld.vadd32(bld.def(v1), Operand((uint32_t)nir_intrinsic_base(instr)), address_offset);
4552          offset = bytes_read;
4553       }
4554       assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */
4555
4556       Temp res;
4557       if (instr->num_components == 1 && dst.type() == RegType::vgpr)
4558          res = dst;
4559       else
4560          res = bld.tmp(RegClass(RegType::vgpr, todo / 4));
4561
4562       if (op == aco_opcode::ds_read2_b32)
4563          res = bld.ds(op, Definition(res), address_offset, m, offset >> 2, (offset >> 2) + 1);
4564       else
4565          res = bld.ds(op, Definition(res), address_offset, m, offset);
4566
4567       if (instr->num_components == 1) {
4568          assert(todo == total_bytes);
4569          if (dst.type() == RegType::sgpr)
4570             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
4571          return;
4572       }
4573
4574       if (dst.type() == RegType::sgpr)
4575          res = bld.as_uniform(res);
4576
4577       if (num_elements == 1) {
4578          result[result_size++] = res;
4579       } else {
4580          assert(res != dst && res.size() % num_elements == 0);
4581          aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)};
4582          split->operands[0] = Operand(res);
4583          for (unsigned i = 0; i < num_elements; i++)
4584             split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4));
4585          ctx->block->instructions.emplace_back(std::move(split));
4586       }
4587
4588       bytes_read += todo;
4589    }
4590
4591    assert(result_size == instr->num_components && result_size > 1);
4592    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)};
4593    for (unsigned i = 0; i < result_size; i++)
4594       vec->operands[i] = Operand(result[i]);
4595    vec->definitions[0] = Definition(dst);
4596    ctx->block->instructions.emplace_back(std::move(vec));
4597    ctx->allocated_vec.emplace(dst.id(), result);
4598 }
4599
4600 void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned offset0, unsigned offset1, unsigned align)
4601 {
4602    Builder bld(ctx->program, ctx->block);
4603    unsigned bytes_written = 0;
4604    while (bytes_written < data.size() * 4) {
4605       unsigned todo = data.size() * 4 - bytes_written;
4606       bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
4607       bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
4608
4609       aco_opcode op = aco_opcode::last_opcode;
4610       unsigned size = 0;
4611       if (todo >= 16 && aligned16) {
4612          op = aco_opcode::ds_write_b128;
4613          size = 4;
4614       } else if (todo >= 12 && aligned16) {
4615          op = aco_opcode::ds_write_b96;
4616          size = 3;
4617       } else if (todo >= 8) {
4618          op = aligned8 ? aco_opcode::ds_write_b64 : aco_opcode::ds_write2_b32;
4619          size = 2;
4620       } else if (todo >= 4) {
4621          op = aco_opcode::ds_write_b32;
4622          size = 1;
4623       } else {
4624          assert(false);
4625       }
4626
4627       bool write2 = op == aco_opcode::ds_write2_b32;
4628       unsigned offset = offset0 + offset1 + bytes_written;
4629       unsigned max_offset = write2 ? 1020 : 65535;
4630       Temp address_offset = address;
4631       if (offset > max_offset) {
4632          address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
4633          offset = offset1 + bytes_written;
4634       }
4635       assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
4636
4637       if (write2) {
4638          Temp val0 = emit_extract_vector(ctx, data, bytes_written >> 2, v1);
4639          Temp val1 = emit_extract_vector(ctx, data, (bytes_written >> 2) + 1, v1);
4640          bld.ds(op, address_offset, val0, val1, m, offset >> 2, (offset >> 2) + 1);
4641       } else {
4642          Temp val = emit_extract_vector(ctx, data, bytes_written >> 2, RegClass(RegType::vgpr, size));
4643          bld.ds(op, address_offset, val, m, offset);
4644       }
4645
4646       bytes_written += size * 4;
4647    }
4648 }
4649
4650 void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4651 {
4652    unsigned offset = nir_intrinsic_base(instr);
4653    unsigned writemask = nir_intrinsic_write_mask(instr);
4654    Operand m = load_lds_size_m0(ctx);
4655    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4656    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4657    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4658    assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
4659
4660    /* we need at most two stores for 32bit variables */
4661    int start[2], count[2];
4662    u_bit_scan_consecutive_range(&writemask, &start[0], &count[0]);
4663    u_bit_scan_consecutive_range(&writemask, &start[1], &count[1]);
4664    assert(writemask == 0);
4665
4666    /* one combined store is sufficient */
4667    if (count[0] == count[1]) {
4668       Builder bld(ctx->program, ctx->block);
4669
4670       Temp address_offset = address;
4671       if ((offset >> 2) + start[1] > 255) {
4672          address_offset = bld.vadd32(bld.def(v1), Operand(offset), address_offset);
4673          offset = 0;
4674       }
4675
4676       assert(count[0] == 1);
4677       Temp val0 = emit_extract_vector(ctx, data, start[0], v1);
4678       Temp val1 = emit_extract_vector(ctx, data, start[1], v1);
4679       aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
4680       offset = offset / elem_size_bytes;
4681       bld.ds(op, address_offset, val0, val1, m,
4682              offset + start[0], offset + start[1]);
4683       return;
4684    }
4685
4686    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
4687    for (unsigned i = 0; i < 2; i++) {
4688       if (count[i] == 0)
4689          continue;
4690
4691       Temp write_data = emit_extract_vector(ctx, data, start[i], RegClass(RegType::vgpr, count[i] * elem_size_bytes / 4));
4692       ds_write_helper(ctx, m, address, write_data, offset, start[i] * elem_size_bytes, align);
4693    }
4694    return;
4695 }
4696
4697 void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4698 {
4699    unsigned offset = nir_intrinsic_base(instr);
4700    Operand m = load_lds_size_m0(ctx);
4701    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4702    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4703
4704    unsigned num_operands = 3;
4705    aco_opcode op32, op64, op32_rtn, op64_rtn;
4706    switch(instr->intrinsic) {
4707       case nir_intrinsic_shared_atomic_add:
4708          op32 = aco_opcode::ds_add_u32;
4709          op64 = aco_opcode::ds_add_u64;
4710          op32_rtn = aco_opcode::ds_add_rtn_u32;
4711          op64_rtn = aco_opcode::ds_add_rtn_u64;
4712          break;
4713       case nir_intrinsic_shared_atomic_imin:
4714          op32 = aco_opcode::ds_min_i32;
4715          op64 = aco_opcode::ds_min_i64;
4716          op32_rtn = aco_opcode::ds_min_rtn_i32;
4717          op64_rtn = aco_opcode::ds_min_rtn_i64;
4718          break;
4719       case nir_intrinsic_shared_atomic_umin:
4720          op32 = aco_opcode::ds_min_u32;
4721          op64 = aco_opcode::ds_min_u64;
4722          op32_rtn = aco_opcode::ds_min_rtn_u32;
4723          op64_rtn = aco_opcode::ds_min_rtn_u64;
4724          break;
4725       case nir_intrinsic_shared_atomic_imax:
4726          op32 = aco_opcode::ds_max_i32;
4727          op64 = aco_opcode::ds_max_i64;
4728          op32_rtn = aco_opcode::ds_max_rtn_i32;
4729          op64_rtn = aco_opcode::ds_max_rtn_i64;
4730          break;
4731       case nir_intrinsic_shared_atomic_umax:
4732          op32 = aco_opcode::ds_max_u32;
4733          op64 = aco_opcode::ds_max_u64;
4734          op32_rtn = aco_opcode::ds_max_rtn_u32;
4735          op64_rtn = aco_opcode::ds_max_rtn_u64;
4736          break;
4737       case nir_intrinsic_shared_atomic_and:
4738          op32 = aco_opcode::ds_and_b32;
4739          op64 = aco_opcode::ds_and_b64;
4740          op32_rtn = aco_opcode::ds_and_rtn_b32;
4741          op64_rtn = aco_opcode::ds_and_rtn_b64;
4742          break;
4743       case nir_intrinsic_shared_atomic_or:
4744          op32 = aco_opcode::ds_or_b32;
4745          op64 = aco_opcode::ds_or_b64;
4746          op32_rtn = aco_opcode::ds_or_rtn_b32;
4747          op64_rtn = aco_opcode::ds_or_rtn_b64;
4748          break;
4749       case nir_intrinsic_shared_atomic_xor:
4750          op32 = aco_opcode::ds_xor_b32;
4751          op64 = aco_opcode::ds_xor_b64;
4752          op32_rtn = aco_opcode::ds_xor_rtn_b32;
4753          op64_rtn = aco_opcode::ds_xor_rtn_b64;
4754          break;
4755       case nir_intrinsic_shared_atomic_exchange:
4756          op32 = aco_opcode::ds_write_b32;
4757          op64 = aco_opcode::ds_write_b64;
4758          op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
4759          op64_rtn = aco_opcode::ds_wrxchg2_rtn_b64;
4760          break;
4761       case nir_intrinsic_shared_atomic_comp_swap:
4762          op32 = aco_opcode::ds_cmpst_b32;
4763          op64 = aco_opcode::ds_cmpst_b64;
4764          op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
4765          op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
4766          num_operands = 4;
4767          break;
4768       default:
4769          unreachable("Unhandled shared atomic intrinsic");
4770    }
4771
4772    /* return the previous value if dest is ever used */
4773    bool return_previous = false;
4774    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4775       return_previous = true;
4776       break;
4777    }
4778    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4779       return_previous = true;
4780       break;
4781    }
4782
4783    aco_opcode op;
4784    if (data.size() == 1) {
4785       assert(instr->dest.ssa.bit_size == 32);
4786       op = return_previous ? op32_rtn : op32;
4787    } else {
4788       assert(instr->dest.ssa.bit_size == 64);
4789       op = return_previous ? op64_rtn : op64;
4790    }
4791
4792    if (offset > 65535) {
4793       Builder bld(ctx->program, ctx->block);
4794       address = bld.vadd32(bld.def(v1), Operand(offset), address);
4795       offset = 0;
4796    }
4797
4798    aco_ptr<DS_instruction> ds;
4799    ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
4800    ds->operands[0] = Operand(address);
4801    ds->operands[1] = Operand(data);
4802    if (num_operands == 4)
4803       ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
4804    ds->operands[num_operands - 1] = m;
4805    ds->offset0 = offset;
4806    if (return_previous)
4807       ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
4808    ctx->block->instructions.emplace_back(std::move(ds));
4809 }
4810
4811 Temp get_scratch_resource(isel_context *ctx)
4812 {
4813    Builder bld(ctx->program, ctx->block);
4814    Temp scratch_addr = ctx->private_segment_buffer;
4815    if (ctx->stage != compute_cs)
4816       scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), ctx->private_segment_buffer, Operand(0u));
4817
4818    uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
4819                         S_008F0C_INDEX_STRIDE(ctx->options->wave_size == 64 ? 3 : 2);;
4820
4821    if (ctx->program->chip_class >= GFX10) {
4822       rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
4823                    S_008F0C_OOB_SELECT(3) |
4824                    S_008F0C_RESOURCE_LEVEL(1);
4825    } else if (ctx->program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */
4826       rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
4827                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
4828    }
4829
4830    /* older generations need element size = 16 bytes. element size removed in GFX9 */
4831    if (ctx->program->chip_class <= GFX8)
4832       rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
4833
4834    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
4835 }
4836
4837 void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
4838    assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64);
4839    Builder bld(ctx->program, ctx->block);
4840    Temp rsrc = get_scratch_resource(ctx);
4841    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4842    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4843
4844    aco_opcode op;
4845    switch (dst.size()) {
4846       case 1:
4847          op = aco_opcode::buffer_load_dword;
4848          break;
4849       case 2:
4850          op = aco_opcode::buffer_load_dwordx2;
4851          break;
4852       case 3:
4853          op = aco_opcode::buffer_load_dwordx3;
4854          break;
4855       case 4:
4856          op = aco_opcode::buffer_load_dwordx4;
4857          break;
4858       case 6:
4859       case 8: {
4860          std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
4861          Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
4862                                 bld.def(v4), offset, rsrc,
4863                                 ctx->scratch_offset, 0, true);
4864          Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
4865                                                   aco_opcode::buffer_load_dwordx4,
4866                                 dst.size() == 6 ? bld.def(v2) : bld.def(v4),
4867                                 offset, rsrc, ctx->scratch_offset, 16, true);
4868          emit_split_vector(ctx, lower, 2);
4869          elems[0] = emit_extract_vector(ctx, lower, 0, v2);
4870          elems[1] = emit_extract_vector(ctx, lower, 1, v2);
4871          if (dst.size() == 8) {
4872             emit_split_vector(ctx, upper, 2);
4873             elems[2] = emit_extract_vector(ctx, upper, 0, v2);
4874             elems[3] = emit_extract_vector(ctx, upper, 1, v2);
4875          } else {
4876             elems[2] = upper;
4877          }
4878
4879          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4880                                                                          Format::PSEUDO, dst.size() / 2, 1)};
4881          for (unsigned i = 0; i < dst.size() / 2; i++)
4882             vec->operands[i] = Operand(elems[i]);
4883          vec->definitions[0] = Definition(dst);
4884          bld.insert(std::move(vec));
4885          ctx->allocated_vec.emplace(dst.id(), elems);
4886          return;
4887       }
4888       default:
4889          unreachable("Wrong dst size for nir_intrinsic_load_scratch");
4890    }
4891
4892    bld.mubuf(op, Definition(dst), offset, rsrc, ctx->scratch_offset, 0, true);
4893    emit_split_vector(ctx, dst, instr->num_components);
4894 }
4895
4896 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
4897    assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
4898    Builder bld(ctx->program, ctx->block);
4899    Temp rsrc = get_scratch_resource(ctx);
4900    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4901    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4902
4903    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4904    unsigned writemask = nir_intrinsic_write_mask(instr);
4905
4906    while (writemask) {
4907       int start, count;
4908       u_bit_scan_consecutive_range(&writemask, &start, &count);
4909       int num_bytes = count * elem_size_bytes;
4910
4911       if (num_bytes > 16) {
4912          assert(elem_size_bytes == 8);
4913          writemask |= (((count - 2) << 1) - 1) << (start + 2);
4914          count = 2;
4915          num_bytes = 16;
4916       }
4917
4918       // TODO: check alignment of sub-dword stores
4919       // TODO: split 3 bytes. there is no store instruction for that
4920
4921       Temp write_data;
4922       if (count != instr->num_components) {
4923          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4924          for (int i = 0; i < count; i++) {
4925             Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
4926             vec->operands[i] = Operand(elem);
4927          }
4928          write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
4929          vec->definitions[0] = Definition(write_data);
4930          ctx->block->instructions.emplace_back(std::move(vec));
4931       } else {
4932          write_data = data;
4933       }
4934
4935       aco_opcode op;
4936       switch (num_bytes) {
4937          case 4:
4938             op = aco_opcode::buffer_store_dword;
4939             break;
4940          case 8:
4941             op = aco_opcode::buffer_store_dwordx2;
4942             break;
4943          case 12:
4944             op = aco_opcode::buffer_store_dwordx3;
4945             break;
4946          case 16:
4947             op = aco_opcode::buffer_store_dwordx4;
4948             break;
4949          default:
4950             unreachable("Invalid data size for nir_intrinsic_store_scratch.");
4951       }
4952
4953       bld.mubuf(op, offset, rsrc, ctx->scratch_offset, write_data, start * elem_size_bytes, true);
4954    }
4955 }
4956
4957 void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
4958    uint8_t log2_ps_iter_samples;
4959    if (ctx->program->info->ps.force_persample) {
4960       log2_ps_iter_samples =
4961          util_logbase2(ctx->options->key.fs.num_samples);
4962    } else {
4963       log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
4964    }
4965
4966    /* The bit pattern matches that used by fixed function fragment
4967     * processing. */
4968    static const unsigned ps_iter_masks[] = {
4969       0xffff, /* not used */
4970       0x5555,
4971       0x1111,
4972       0x0101,
4973       0x0001,
4974    };
4975    assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
4976
4977    Builder bld(ctx->program, ctx->block);
4978
4979    Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ctx->fs_inputs[fs_input::ancillary], Operand(8u), Operand(4u));
4980    Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
4981    Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
4982    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4983    bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, ctx->fs_inputs[fs_input::sample_coverage]);
4984 }
4985
4986 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
4987 {
4988    Builder bld(ctx->program, ctx->block);
4989
4990    if (cluster_size == 1) {
4991       return src;
4992    } if (op == nir_op_iand && cluster_size == 4) {
4993       //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
4994       Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
4995       return bld.sop1(aco_opcode::s_not_b64, bld.def(s2), bld.def(s1, scc),
4996                       bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), tmp));
4997    } else if (op == nir_op_ior && cluster_size == 4) {
4998       //subgroupClusteredOr(val, 4) -> wqm(val & exec)
4999       return bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc),
5000                       bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)));
5001    } else if (op == nir_op_iand && cluster_size == 64) {
5002       //subgroupAnd(val) -> (exec & ~val) == 0
5003       Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
5004       return bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), tmp, Operand(0u));
5005    } else if (op == nir_op_ior && cluster_size == 64) {
5006       //subgroupOr(val) -> (val & exec) != 0
5007       return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp();
5008    } else if (op == nir_op_ixor && cluster_size == 64) {
5009       //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
5010       Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5011       tmp = bld.sop1(aco_opcode::s_bcnt1_i32_b64, bld.def(s2), bld.def(s1, scc), tmp);
5012       return bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
5013    } else {
5014       //subgroupClustered{And,Or,Xor}(val, n) ->
5015       //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0))
5016       //cluster_offset = ~(n - 1) & lane_id
5017       //cluster_mask = ((1 << n) - 1)
5018       //subgroupClusteredAnd():
5019       //   return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
5020       //subgroupClusteredOr():
5021       //   return ((val & exec) >> cluster_offset) & cluster_mask != 0
5022       //subgroupClusteredXor():
5023       //   return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
5024       Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5025                               bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5026       Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
5027
5028       Temp tmp;
5029       if (op == nir_op_iand)
5030          tmp = bld.sop2(aco_opcode::s_orn2_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5031       else
5032          tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5033
5034       uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
5035       tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
5036       tmp = emit_extract_vector(ctx, tmp, 0, v1);
5037       if (cluster_mask != 0xffffffff)
5038          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
5039
5040       Definition cmp_def = Definition();
5041       if (op == nir_op_iand) {
5042          cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(cluster_mask), tmp).def(0);
5043       } else if (op == nir_op_ior) {
5044          cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5045       } else if (op == nir_op_ixor) {
5046          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
5047                         bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
5048          cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5049       }
5050       cmp_def.setHint(vcc);
5051       return cmp_def.getTemp();
5052    }
5053 }
5054
5055 Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
5056 {
5057    Builder bld(ctx->program, ctx->block);
5058
5059    //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
5060    //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
5061    //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
5062    Temp tmp;
5063    if (op == nir_op_iand)
5064       tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
5065    else
5066       tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5067
5068    Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
5069    Temp lo = lohi.def(0).getTemp();
5070    Temp hi = lohi.def(1).getTemp();
5071    Temp mbcnt = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), hi,
5072                          bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), lo, Operand(0u)));
5073
5074    Definition cmp_def = Definition();
5075    if (op == nir_op_iand)
5076       cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5077    else if (op == nir_op_ior)
5078       cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5079    else if (op == nir_op_ixor)
5080       cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u),
5081                          bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
5082    cmp_def.setHint(vcc);
5083    return cmp_def.getTemp();
5084 }
5085
5086 Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
5087 {
5088    Builder bld(ctx->program, ctx->block);
5089
5090    //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
5091    //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
5092    //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
5093    Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
5094    if (op == nir_op_iand)
5095       return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5096    else if (op == nir_op_ior)
5097       return bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5098    else if (op == nir_op_ixor)
5099       return bld.sop2(aco_opcode::s_xor_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5100
5101    assert(false);
5102    return Temp();
5103 }
5104
5105 void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
5106 {
5107    Builder bld(ctx->program, ctx->block);
5108    Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
5109    if (src.regClass().type() == RegType::vgpr) {
5110       bld.pseudo(aco_opcode::p_as_uniform, dst, src);
5111    } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5112       bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(dst), Operand(0u), Operand(src));
5113    } else if (src.regClass() == s1) {
5114       bld.sop1(aco_opcode::s_mov_b32, dst, src);
5115    } else if (src.regClass() == s2) {
5116       bld.sop1(aco_opcode::s_mov_b64, dst, src);
5117    } else {
5118       fprintf(stderr, "Unimplemented NIR instr bit size: ");
5119       nir_print_instr(&instr->instr, stderr);
5120       fprintf(stderr, "\n");
5121    }
5122 }
5123
5124 void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
5125 {
5126    Builder bld(ctx->program, ctx->block);
5127    Temp p1 = ctx->fs_inputs[fs_input::persp_center_p1];
5128    Temp p2 = ctx->fs_inputs[fs_input::persp_center_p2];
5129
5130    /* Build DD X/Y */
5131    Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_quad_perm(0, 0, 0, 0));
5132    Temp ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(1, 1, 1, 1));
5133    Temp ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(2, 2, 2, 2));
5134    Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_quad_perm(0, 0, 0, 0));
5135    Temp ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(1, 1, 1, 1));
5136    Temp ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(2, 2, 2, 2));
5137
5138    /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
5139    Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
5140    Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
5141    tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
5142    tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
5143    Temp wqm1 = bld.tmp(v1);
5144    emit_wqm(ctx, tmp1, wqm1, true);
5145    Temp wqm2 = bld.tmp(v1);
5146    emit_wqm(ctx, tmp2, wqm2, true);
5147    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
5148    return;
5149 }
5150
5151 void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
5152 {
5153    Builder bld(ctx->program, ctx->block);
5154    switch(instr->intrinsic) {
5155    case nir_intrinsic_load_barycentric_sample:
5156    case nir_intrinsic_load_barycentric_pixel:
5157    case nir_intrinsic_load_barycentric_centroid: {
5158       glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
5159       fs_input input = get_interp_input(instr->intrinsic, mode);
5160
5161       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5162       if (input == fs_input::max_inputs) {
5163          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5164                     Operand(0u), Operand(0u));
5165       } else {
5166          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5167                     ctx->fs_inputs[input],
5168                     ctx->fs_inputs[input + 1]);
5169       }
5170       emit_split_vector(ctx, dst, 2);
5171       break;
5172    }
5173    case nir_intrinsic_load_barycentric_at_sample: {
5174       uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
5175       switch (ctx->options->key.fs.num_samples) {
5176          case 2: sample_pos_offset += 1 << 3; break;
5177          case 4: sample_pos_offset += 3 << 3; break;
5178          case 8: sample_pos_offset += 7 << 3; break;
5179          default: break;
5180       }
5181       Temp sample_pos;
5182       Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
5183       nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
5184       if (addr.type() == RegType::sgpr) {
5185          Operand offset;
5186          if (const_addr) {
5187             sample_pos_offset += const_addr->u32 << 3;
5188             offset = Operand(sample_pos_offset);
5189          } else if (ctx->options->chip_class >= GFX9) {
5190             offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
5191          } else {
5192             offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
5193             offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
5194          }
5195          addr = ctx->private_segment_buffer;
5196          sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand(offset));
5197
5198       } else if (ctx->options->chip_class >= GFX9) {
5199          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5200          sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, ctx->private_segment_buffer, sample_pos_offset);
5201       } else {
5202          /* addr += ctx->private_segment_buffer + sample_pos_offset */
5203          Temp tmp0 = bld.tmp(s1);
5204          Temp tmp1 = bld.tmp(s1);
5205          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), ctx->private_segment_buffer);
5206          Definition scc_tmp = bld.def(s1, scc);
5207          tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
5208          tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), scc_tmp.getTemp());
5209          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5210          Temp pck0 = bld.tmp(v1);
5211          Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
5212          tmp1 = as_vgpr(ctx, tmp1);
5213          Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(s2)), tmp1, Operand(0u), carry);
5214          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
5215
5216          /* sample_pos = flat_load_dwordx2 addr */
5217          sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
5218       }
5219
5220       /* sample_pos -= 0.5 */
5221       Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
5222       Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
5223       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
5224       pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
5225       pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
5226
5227       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5228       break;
5229    }
5230    case nir_intrinsic_load_barycentric_at_offset: {
5231       Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5232       RegClass rc = RegClass(offset.type(), 1);
5233       Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
5234       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
5235       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5236       break;
5237    }
5238    case nir_intrinsic_load_front_face: {
5239       bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5240                Operand(0u), ctx->fs_inputs[fs_input::front_face]).def(0).setHint(vcc);
5241       break;
5242    }
5243    case nir_intrinsic_load_view_index:
5244    case nir_intrinsic_load_layer_id: {
5245       if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & sw_vs)) {
5246          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5247          bld.copy(Definition(dst), Operand(ctx->view_index));
5248          break;
5249       }
5250
5251       unsigned idx = nir_intrinsic_base(instr);
5252       bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5253                  Operand(2u), bld.m0(ctx->prim_mask), idx, 0);
5254       break;
5255    }
5256    case nir_intrinsic_load_frag_coord: {
5257       emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
5258       break;
5259    }
5260    case nir_intrinsic_load_sample_pos: {
5261       Temp posx = ctx->fs_inputs[fs_input::frag_pos_0];
5262       Temp posy = ctx->fs_inputs[fs_input::frag_pos_1];
5263       bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5264                  posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
5265                  posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
5266       break;
5267    }
5268    case nir_intrinsic_load_interpolated_input:
5269       visit_load_interpolated_input(ctx, instr);
5270       break;
5271    case nir_intrinsic_store_output:
5272       visit_store_output(ctx, instr);
5273       break;
5274    case nir_intrinsic_load_input:
5275       visit_load_input(ctx, instr);
5276       break;
5277    case nir_intrinsic_load_ubo:
5278       visit_load_ubo(ctx, instr);
5279       break;
5280    case nir_intrinsic_load_push_constant:
5281       visit_load_push_constant(ctx, instr);
5282       break;
5283    case nir_intrinsic_load_constant:
5284       visit_load_constant(ctx, instr);
5285       break;
5286    case nir_intrinsic_vulkan_resource_index:
5287       visit_load_resource(ctx, instr);
5288       break;
5289    case nir_intrinsic_discard:
5290       visit_discard(ctx, instr);
5291       break;
5292    case nir_intrinsic_discard_if:
5293       visit_discard_if(ctx, instr);
5294       break;
5295    case nir_intrinsic_load_shared:
5296       visit_load_shared(ctx, instr);
5297       break;
5298    case nir_intrinsic_store_shared:
5299       visit_store_shared(ctx, instr);
5300       break;
5301    case nir_intrinsic_shared_atomic_add:
5302    case nir_intrinsic_shared_atomic_imin:
5303    case nir_intrinsic_shared_atomic_umin:
5304    case nir_intrinsic_shared_atomic_imax:
5305    case nir_intrinsic_shared_atomic_umax:
5306    case nir_intrinsic_shared_atomic_and:
5307    case nir_intrinsic_shared_atomic_or:
5308    case nir_intrinsic_shared_atomic_xor:
5309    case nir_intrinsic_shared_atomic_exchange:
5310    case nir_intrinsic_shared_atomic_comp_swap:
5311       visit_shared_atomic(ctx, instr);
5312       break;
5313    case nir_intrinsic_image_deref_load:
5314       visit_image_load(ctx, instr);
5315       break;
5316    case nir_intrinsic_image_deref_store:
5317       visit_image_store(ctx, instr);
5318       break;
5319    case nir_intrinsic_image_deref_atomic_add:
5320    case nir_intrinsic_image_deref_atomic_umin:
5321    case nir_intrinsic_image_deref_atomic_imin:
5322    case nir_intrinsic_image_deref_atomic_umax:
5323    case nir_intrinsic_image_deref_atomic_imax:
5324    case nir_intrinsic_image_deref_atomic_and:
5325    case nir_intrinsic_image_deref_atomic_or:
5326    case nir_intrinsic_image_deref_atomic_xor:
5327    case nir_intrinsic_image_deref_atomic_exchange:
5328    case nir_intrinsic_image_deref_atomic_comp_swap:
5329       visit_image_atomic(ctx, instr);
5330       break;
5331    case nir_intrinsic_image_deref_size:
5332       visit_image_size(ctx, instr);
5333       break;
5334    case nir_intrinsic_load_ssbo:
5335       visit_load_ssbo(ctx, instr);
5336       break;
5337    case nir_intrinsic_store_ssbo:
5338       visit_store_ssbo(ctx, instr);
5339       break;
5340    case nir_intrinsic_load_global:
5341       visit_load_global(ctx, instr);
5342       break;
5343    case nir_intrinsic_store_global:
5344       visit_store_global(ctx, instr);
5345       break;
5346    case nir_intrinsic_ssbo_atomic_add:
5347    case nir_intrinsic_ssbo_atomic_imin:
5348    case nir_intrinsic_ssbo_atomic_umin:
5349    case nir_intrinsic_ssbo_atomic_imax:
5350    case nir_intrinsic_ssbo_atomic_umax:
5351    case nir_intrinsic_ssbo_atomic_and:
5352    case nir_intrinsic_ssbo_atomic_or:
5353    case nir_intrinsic_ssbo_atomic_xor:
5354    case nir_intrinsic_ssbo_atomic_exchange:
5355    case nir_intrinsic_ssbo_atomic_comp_swap:
5356       visit_atomic_ssbo(ctx, instr);
5357       break;
5358    case nir_intrinsic_load_scratch:
5359       visit_load_scratch(ctx, instr);
5360       break;
5361    case nir_intrinsic_store_scratch:
5362       visit_store_scratch(ctx, instr);
5363       break;
5364    case nir_intrinsic_get_buffer_size:
5365       visit_get_buffer_size(ctx, instr);
5366       break;
5367    case nir_intrinsic_barrier: {
5368       unsigned* bsize = ctx->program->info->cs.block_size;
5369       unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
5370       if (workgroup_size > 64)
5371          bld.sopp(aco_opcode::s_barrier);
5372       break;
5373    }
5374    case nir_intrinsic_group_memory_barrier:
5375    case nir_intrinsic_memory_barrier:
5376    case nir_intrinsic_memory_barrier_atomic_counter:
5377    case nir_intrinsic_memory_barrier_buffer:
5378    case nir_intrinsic_memory_barrier_image:
5379    case nir_intrinsic_memory_barrier_shared:
5380       emit_memory_barrier(ctx, instr);
5381       break;
5382    case nir_intrinsic_load_num_work_groups:
5383    case nir_intrinsic_load_work_group_id:
5384    case nir_intrinsic_load_local_invocation_id: {
5385       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5386       Temp* ids;
5387       if (instr->intrinsic == nir_intrinsic_load_num_work_groups)
5388          ids = ctx->num_workgroups;
5389       else if (instr->intrinsic == nir_intrinsic_load_work_group_id)
5390          ids = ctx->workgroup_ids;
5391       else
5392          ids = ctx->local_invocation_ids;
5393       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5394                  ids[0].id() ? Operand(ids[0]) : Operand(1u),
5395                  ids[1].id() ? Operand(ids[1]) : Operand(1u),
5396                  ids[2].id() ? Operand(ids[2]) : Operand(1u));
5397       emit_split_vector(ctx, dst, 3);
5398       break;
5399    }
5400    case nir_intrinsic_load_local_invocation_index: {
5401       Temp id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5402                          bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5403       Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
5404       bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
5405       break;
5406    }
5407    case nir_intrinsic_load_subgroup_id: {
5408       if (ctx->stage == compute_cs) {
5409          Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
5410          bld.sop2(aco_opcode::s_lshr_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), tg_num, Operand(0x6u));
5411       } else {
5412          bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
5413       }
5414       break;
5415    }
5416    case nir_intrinsic_load_subgroup_invocation: {
5417       bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand((uint32_t) -1),
5418                bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5419       break;
5420    }
5421    case nir_intrinsic_load_num_subgroups: {
5422       if (ctx->stage == compute_cs)
5423          bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu), ctx->tg_size);
5424       else
5425          bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
5426       break;
5427    }
5428    case nir_intrinsic_ballot: {
5429       Definition tmp = bld.def(s2);
5430       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5431       if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s2) {
5432          bld.sop2(aco_opcode::s_and_b64, tmp, bld.def(s1, scc), Operand(exec, s2), src);
5433       } else if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s1) {
5434          bld.sop2(aco_opcode::s_cselect_b64, tmp, Operand(exec, s2), Operand(0u), bld.scc(src));
5435       } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
5436          bld.vopc(aco_opcode::v_cmp_lg_u32, tmp, Operand(0u), src);
5437       } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
5438          bld.vopc(aco_opcode::v_cmp_lg_u64, tmp, Operand(0u), src);
5439       } else {
5440          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5441          nir_print_instr(&instr->instr, stderr);
5442          fprintf(stderr, "\n");
5443       }
5444       emit_wqm(ctx, tmp.getTemp(), get_ssa_temp(ctx, &instr->dest.ssa));
5445       break;
5446    }
5447    case nir_intrinsic_shuffle: {
5448       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5449       if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5450          emit_uniform_subgroup(ctx, instr, src);
5451       } else {
5452          Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
5453          assert(tid.regClass() == v1);
5454          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5455          if (src.regClass() == v1) {
5456             tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
5457             emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, src), dst);
5458          } else if (src.regClass() == v2) {
5459             tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
5460
5461             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5462             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5463             lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, lo));
5464             hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, hi));
5465             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5466             emit_split_vector(ctx, dst, 2);
5467          } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5468             Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
5469             tmp = emit_extract_vector(ctx, tmp, 0, v1);
5470             tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
5471             emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp), dst);
5472          } else {
5473             fprintf(stderr, "Unimplemented NIR instr bit size: ");
5474             nir_print_instr(&instr->instr, stderr);
5475             fprintf(stderr, "\n");
5476          }
5477       }
5478       break;
5479    }
5480    case nir_intrinsic_load_sample_id: {
5481       bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5482                ctx->fs_inputs[ancillary], Operand(8u), Operand(4u));
5483       break;
5484    }
5485    case nir_intrinsic_load_sample_mask_in: {
5486       visit_load_sample_mask_in(ctx, instr);
5487       break;
5488    }
5489    case nir_intrinsic_read_first_invocation: {
5490       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5491       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5492       if (src.regClass() == v1) {
5493          emit_wqm(ctx,
5494                   bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
5495                   dst);
5496       } else if (src.regClass() == v2) {
5497          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5498          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5499          lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
5500          hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
5501          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5502          emit_split_vector(ctx, dst, 2);
5503       } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5504          emit_wqm(ctx,
5505                   bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src,
5506                            bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2))),
5507                   dst);
5508       } else if (src.regClass() == s1) {
5509          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
5510       } else if (src.regClass() == s2) {
5511          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
5512       } else {
5513          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5514          nir_print_instr(&instr->instr, stderr);
5515          fprintf(stderr, "\n");
5516       }
5517       break;
5518    }
5519    case nir_intrinsic_read_invocation: {
5520       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5521       Temp lane = get_ssa_temp(ctx, instr->src[1].ssa);
5522       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5523       assert(lane.regClass() == s1);
5524       if (src.regClass() == v1) {
5525          emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), src, lane), dst);
5526       } else if (src.regClass() == v2) {
5527          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5528          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5529          lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), lo, lane));
5530          hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), hi, lane));
5531          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5532          emit_split_vector(ctx, dst, 2);
5533       } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5534          emit_wqm(ctx, bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, lane), dst);
5535       } else if (src.regClass() == s1) {
5536          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
5537       } else if (src.regClass() == s2) {
5538          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
5539       } else {
5540          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5541          nir_print_instr(&instr->instr, stderr);
5542          fprintf(stderr, "\n");
5543       }
5544       break;
5545    }
5546    case nir_intrinsic_vote_all: {
5547       Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
5548       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5549       assert(src.regClass() == s2);
5550       assert(dst.regClass() == s1);
5551
5552       Definition tmp = bld.def(s1);
5553       bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(tmp),
5554                bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)),
5555                Operand(exec, s2));
5556       emit_wqm(ctx, tmp.getTemp(), dst);
5557       break;
5558    }
5559    case nir_intrinsic_vote_any: {
5560       Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
5561       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5562       assert(src.regClass() == s2);
5563       assert(dst.regClass() == s1);
5564
5565       Definition tmp = bld.def(s1);
5566       bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(tmp), src, Operand(exec, s2));
5567       emit_wqm(ctx, tmp.getTemp(), dst);
5568       break;
5569    }
5570    case nir_intrinsic_reduce:
5571    case nir_intrinsic_inclusive_scan:
5572    case nir_intrinsic_exclusive_scan: {
5573       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5574       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5575       nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
5576       unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
5577          nir_intrinsic_cluster_size(instr) : 0;
5578       cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : 64, 64));
5579
5580       if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
5581          emit_uniform_subgroup(ctx, instr, src);
5582       } else if (instr->dest.ssa.bit_size == 1) {
5583          if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
5584             op = nir_op_iand;
5585          else if (op == nir_op_iadd)
5586             op = nir_op_ixor;
5587          else if (op == nir_op_umax || op == nir_op_imax)
5588             op = nir_op_ior;
5589          assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
5590
5591          switch (instr->intrinsic) {
5592          case nir_intrinsic_reduce:
5593             emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
5594             break;
5595          case nir_intrinsic_exclusive_scan:
5596             emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
5597             break;
5598          case nir_intrinsic_inclusive_scan:
5599             emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
5600             break;
5601          default:
5602             assert(false);
5603          }
5604       } else if (cluster_size == 1) {
5605          bld.copy(Definition(dst), src);
5606       } else {
5607          src = as_vgpr(ctx, src);
5608
5609          ReduceOp reduce_op;
5610          switch (op) {
5611          #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
5612             CASE(iadd)
5613             CASE(imul)
5614             CASE(fadd)
5615             CASE(fmul)
5616             CASE(imin)
5617             CASE(umin)
5618             CASE(fmin)
5619             CASE(imax)
5620             CASE(umax)
5621             CASE(fmax)
5622             CASE(iand)
5623             CASE(ior)
5624             CASE(ixor)
5625             default:
5626                unreachable("unknown reduction op");
5627          #undef CASE
5628          }
5629
5630          aco_opcode aco_op;
5631          switch (instr->intrinsic) {
5632             case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
5633             case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
5634             case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
5635             default:
5636                unreachable("unknown reduce intrinsic");
5637          }
5638
5639          aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
5640          reduce->operands[0] = Operand(src);
5641          // filled in by aco_reduce_assign.cpp, used internally as part of the
5642          // reduce sequence
5643          assert(dst.size() == 1 || dst.size() == 2);
5644          reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
5645          reduce->operands[2] = Operand(v1.as_linear());
5646
5647          Temp tmp_dst = bld.tmp(dst.regClass());
5648          reduce->definitions[0] = Definition(tmp_dst);
5649          reduce->definitions[1] = bld.def(s2); // used internally
5650          reduce->definitions[2] = Definition();
5651          reduce->definitions[3] = Definition(scc, s1);
5652          reduce->definitions[4] = Definition();
5653          reduce->reduce_op = reduce_op;
5654          reduce->cluster_size = cluster_size;
5655          ctx->block->instructions.emplace_back(std::move(reduce));
5656
5657          emit_wqm(ctx, tmp_dst, dst);
5658       }
5659       break;
5660    }
5661    case nir_intrinsic_quad_broadcast: {
5662       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5663       if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5664          emit_uniform_subgroup(ctx, instr, src);
5665       } else {
5666          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5667          unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
5668          if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5669             uint32_t half_mask = 0x11111111u << lane;
5670             Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
5671             Temp tmp = bld.tmp(s2);
5672             bld.sop1(aco_opcode::s_wqm_b64, Definition(tmp),
5673                      bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), mask_tmp,
5674                               bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2))));
5675             emit_wqm(ctx, tmp, dst);
5676          } else if (instr->dest.ssa.bit_size == 32) {
5677             emit_wqm(ctx,
5678                      bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src,
5679                                   dpp_quad_perm(lane, lane, lane, lane)),
5680                      dst);
5681          } else if (instr->dest.ssa.bit_size == 64) {
5682             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5683             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5684             lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_quad_perm(lane, lane, lane, lane)));
5685             hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_quad_perm(lane, lane, lane, lane)));
5686             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5687             emit_split_vector(ctx, dst, 2);
5688          } else {
5689             fprintf(stderr, "Unimplemented NIR instr bit size: ");
5690             nir_print_instr(&instr->instr, stderr);
5691             fprintf(stderr, "\n");
5692          }
5693       }
5694       break;
5695    }
5696    case nir_intrinsic_quad_swap_horizontal:
5697    case nir_intrinsic_quad_swap_vertical:
5698    case nir_intrinsic_quad_swap_diagonal:
5699    case nir_intrinsic_quad_swizzle_amd: {
5700       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5701       if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5702          emit_uniform_subgroup(ctx, instr, src);
5703          break;
5704       }
5705       uint16_t dpp_ctrl = 0;
5706       switch (instr->intrinsic) {
5707       case nir_intrinsic_quad_swap_horizontal:
5708          dpp_ctrl = dpp_quad_perm(1, 0, 3, 2);
5709          break;
5710       case nir_intrinsic_quad_swap_vertical:
5711          dpp_ctrl = dpp_quad_perm(2, 3, 0, 1);
5712          break;
5713       case nir_intrinsic_quad_swap_diagonal:
5714          dpp_ctrl = dpp_quad_perm(3, 2, 1, 0);
5715          break;
5716       case nir_intrinsic_quad_swizzle_amd: {
5717          dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
5718          break;
5719       }
5720       default:
5721          break;
5722       }
5723
5724       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5725       if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5726          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
5727          src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5728          Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), src);
5729          emit_wqm(ctx, tmp, dst);
5730       } else if (instr->dest.ssa.bit_size == 32) {
5731          Temp tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5732          emit_wqm(ctx, tmp, dst);
5733       } else if (instr->dest.ssa.bit_size == 64) {
5734          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5735          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5736          lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
5737          hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
5738          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5739          emit_split_vector(ctx, dst, 2);
5740       } else {
5741          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5742          nir_print_instr(&instr->instr, stderr);
5743          fprintf(stderr, "\n");
5744       }
5745       break;
5746    }
5747    case nir_intrinsic_masked_swizzle_amd: {
5748       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5749       if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5750          emit_uniform_subgroup(ctx, instr, src);
5751          break;
5752       }
5753       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5754       uint32_t mask = nir_intrinsic_swizzle_mask(instr);
5755       if (dst.regClass() == v1) {
5756          emit_wqm(ctx,
5757                   bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
5758                   dst);
5759       } else if (dst.regClass() == v2) {
5760          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5761          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5762          lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
5763          hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
5764          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5765          emit_split_vector(ctx, dst, 2);
5766       } else {
5767          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5768          nir_print_instr(&instr->instr, stderr);
5769          fprintf(stderr, "\n");
5770       }
5771       break;
5772    }
5773    case nir_intrinsic_write_invocation_amd: {
5774       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5775       Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
5776       Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
5777       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5778       if (dst.regClass() == v1) {
5779          /* src2 is ignored for writelane. RA assigns the same reg for dst */
5780          emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val, lane, src), dst);
5781       } else if (dst.regClass() == v2) {
5782          Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
5783          Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
5784          bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
5785          bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
5786          Temp lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_lo, lane, src_hi));
5787          Temp hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_hi, lane, src_hi));
5788          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5789          emit_split_vector(ctx, dst, 2);
5790       } else {
5791          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5792          nir_print_instr(&instr->instr, stderr);
5793          fprintf(stderr, "\n");
5794       }
5795       break;
5796    }
5797    case nir_intrinsic_mbcnt_amd: {
5798       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5799       RegClass rc = RegClass(src.type(), 1);
5800       Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
5801       bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
5802       Temp tmp = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, Operand(0u));
5803       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5804       Temp wqm_tmp = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), mask_hi, tmp);
5805       emit_wqm(ctx, wqm_tmp, dst);
5806       break;
5807    }
5808    case nir_intrinsic_load_helper_invocation: {
5809       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5810       bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
5811       ctx->block->kind |= block_kind_needs_lowering;
5812       ctx->program->needs_exact = true;
5813       break;
5814    }
5815    case nir_intrinsic_is_helper_invocation: {
5816       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5817       bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
5818       ctx->block->kind |= block_kind_needs_lowering;
5819       ctx->program->needs_exact = true;
5820       break;
5821    }
5822    case nir_intrinsic_demote:
5823       bld.pseudo(aco_opcode::p_demote_to_helper);
5824       ctx->block->kind |= block_kind_uses_demote;
5825       ctx->program->needs_exact = true;
5826       break;
5827    case nir_intrinsic_demote_if: {
5828       Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
5829                            as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false),
5830                            Operand(exec, s2));
5831       bld.pseudo(aco_opcode::p_demote_to_helper, cond);
5832       ctx->block->kind |= block_kind_uses_demote;
5833       ctx->program->needs_exact = true;
5834       break;
5835    }
5836    case nir_intrinsic_first_invocation: {
5837       emit_wqm(ctx, bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)),
5838                get_ssa_temp(ctx, &instr->dest.ssa));
5839       break;
5840    }
5841    case nir_intrinsic_shader_clock:
5842       bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
5843       break;
5844    case nir_intrinsic_load_vertex_id_zero_base: {
5845       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5846       bld.copy(Definition(dst), ctx->vertex_id);
5847       break;
5848    }
5849    case nir_intrinsic_load_first_vertex: {
5850       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5851       bld.copy(Definition(dst), ctx->base_vertex);
5852       break;
5853    }
5854    case nir_intrinsic_load_base_instance: {
5855       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5856       bld.copy(Definition(dst), ctx->start_instance);
5857       break;
5858    }
5859    case nir_intrinsic_load_instance_id: {
5860       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5861       bld.copy(Definition(dst), ctx->instance_id);
5862       break;
5863    }
5864    case nir_intrinsic_load_draw_id: {
5865       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5866       bld.copy(Definition(dst), ctx->draw_id);
5867       break;
5868    }
5869    default:
5870       fprintf(stderr, "Unimplemented intrinsic instr: ");
5871       nir_print_instr(&instr->instr, stderr);
5872       fprintf(stderr, "\n");
5873       abort();
5874
5875       break;
5876    }
5877 }
5878
5879
5880 void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr,
5881                     Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr,
5882                     enum glsl_base_type *stype)
5883 {
5884    nir_deref_instr *texture_deref_instr = NULL;
5885    nir_deref_instr *sampler_deref_instr = NULL;
5886    int plane = -1;
5887
5888    for (unsigned i = 0; i < instr->num_srcs; i++) {
5889       switch (instr->src[i].src_type) {
5890       case nir_tex_src_texture_deref:
5891          texture_deref_instr = nir_src_as_deref(instr->src[i].src);
5892          break;
5893       case nir_tex_src_sampler_deref:
5894          sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
5895          break;
5896       case nir_tex_src_plane:
5897          plane = nir_src_as_int(instr->src[i].src);
5898          break;
5899       default:
5900          break;
5901       }
5902    }
5903
5904    *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
5905
5906    if (!sampler_deref_instr)
5907       sampler_deref_instr = texture_deref_instr;
5908
5909    if (plane >= 0) {
5910       assert(instr->op != nir_texop_txf_ms &&
5911              instr->op != nir_texop_samples_identical);
5912       assert(instr->sampler_dim  != GLSL_SAMPLER_DIM_BUF);
5913       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
5914    } else if (instr->sampler_dim  == GLSL_SAMPLER_DIM_BUF) {
5915       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
5916    } else {
5917       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
5918    }
5919    if (samp_ptr) {
5920       *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false);
5921       if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
5922          fprintf(stderr, "Unimplemented sampler descriptor: ");
5923          nir_print_instr(&instr->instr, stderr);
5924          fprintf(stderr, "\n");
5925          abort();
5926          // TODO: build samp_ptr = and(samp_ptr, res_ptr)
5927       }
5928    }
5929    if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
5930                      instr->op == nir_texop_samples_identical))
5931       *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
5932 }
5933
5934 void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
5935                        Temp *out_ma, Temp *out_sc, Temp *out_tc)
5936 {
5937    Builder bld(ctx->program, ctx->block);
5938
5939    Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
5940    Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
5941    Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
5942
5943    Operand neg_one(0xbf800000u);
5944    Operand one(0x3f800000u);
5945    Operand two(0x40000000u);
5946    Operand four(0x40800000u);
5947
5948    Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), ma);
5949    Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
5950    Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
5951
5952    Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), four, id);
5953    Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id);
5954    is_ma_y = bld.sop2(aco_opcode::s_andn2_b64, bld.hint_vcc(bld.def(s2)), is_ma_y, is_ma_z);
5955    Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), is_ma_z, is_ma_y);
5956
5957    // select sc
5958    Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
5959    Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1),
5960                        bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z),
5961                        one, is_ma_y);
5962    *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
5963
5964    // select tc
5965    tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
5966    sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
5967    *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
5968
5969    // select ma
5970    tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
5971                   bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
5972                   deriv_z, is_ma_z);
5973    tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp);
5974    *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
5975 }
5976
5977 void prepare_cube_coords(isel_context *ctx, Temp* coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
5978 {
5979    Builder bld(ctx->program, ctx->block);
5980    Temp coord_args[4], ma, tc, sc, id;
5981    for (unsigned i = 0; i < (is_array ? 4 : 3); i++)
5982       coord_args[i] = emit_extract_vector(ctx, *coords, i, v1);
5983
5984    if (is_array) {
5985       coord_args[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_args[3]);
5986
5987       // see comment in ac_prepare_cube_coords()
5988       if (ctx->options->chip_class <= GFX8)
5989          coord_args[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coord_args[3]);
5990    }
5991
5992    ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
5993
5994    aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
5995    vop3a->operands[0] = Operand(ma);
5996    vop3a->abs[0] = true;
5997    Temp invma = bld.tmp(v1);
5998    vop3a->definitions[0] = Definition(invma);
5999    ctx->block->instructions.emplace_back(std::move(vop3a));
6000
6001    sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6002    if (!is_deriv)
6003       sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
6004
6005    tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6006    if (!is_deriv)
6007       tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
6008
6009    id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6010
6011    if (is_deriv) {
6012       sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
6013       tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
6014
6015       for (unsigned i = 0; i < 2; i++) {
6016          // see comment in ac_prepare_cube_coords()
6017          Temp deriv_ma;
6018          Temp deriv_sc, deriv_tc;
6019          build_cube_select(ctx, ma, id, i ? *ddy : *ddx,
6020                            &deriv_ma, &deriv_sc, &deriv_tc);
6021
6022          deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
6023
6024          Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6025                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
6026                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
6027          Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6028                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
6029                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
6030          *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
6031       }
6032
6033       sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc);
6034       tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc);
6035    }
6036
6037    if (is_array)
6038       id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coord_args[3], id, Operand(0x41000000u/*8.0*/));
6039    *coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), sc, tc, id);
6040
6041 }
6042
6043 Temp apply_round_slice(isel_context *ctx, Temp coords, unsigned idx)
6044 {
6045    Temp coord_vec[3];
6046    for (unsigned i = 0; i < coords.size(); i++)
6047       coord_vec[i] = emit_extract_vector(ctx, coords, i, v1);
6048
6049    Builder bld(ctx->program, ctx->block);
6050    coord_vec[idx] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_vec[idx]);
6051
6052    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6053    for (unsigned i = 0; i < coords.size(); i++)
6054       vec->operands[i] = Operand(coord_vec[i]);
6055    Temp res = bld.tmp(RegType::vgpr, coords.size());
6056    vec->definitions[0] = Definition(res);
6057    ctx->block->instructions.emplace_back(std::move(vec));
6058    return res;
6059 }
6060
6061 void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
6062 {
6063    if (vec->parent_instr->type != nir_instr_type_alu)
6064       return;
6065    nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr);
6066    if (vec_instr->op != nir_op_vec(vec->num_components))
6067       return;
6068
6069    for (unsigned i = 0; i < vec->num_components; i++) {
6070       cv[i] = vec_instr->src[i].swizzle[0] == 0 ?
6071               nir_src_as_const_value(vec_instr->src[i].src) : NULL;
6072    }
6073 }
6074
6075 void visit_tex(isel_context *ctx, nir_tex_instr *instr)
6076 {
6077    Builder bld(ctx->program, ctx->block);
6078    bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
6079         has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
6080    Temp resource, sampler, fmask_ptr, bias = Temp(), coords, compare = Temp(), sample_index = Temp(),
6081         lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), derivs = Temp();
6082    nir_const_value *sample_index_cv = NULL;
6083    nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
6084    enum glsl_base_type stype;
6085    tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
6086
6087    bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
6088                                   (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
6089    bool tg4_integer_cube_workaround = tg4_integer_workarounds &&
6090                                       instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
6091
6092    for (unsigned i = 0; i < instr->num_srcs; i++) {
6093       switch (instr->src[i].src_type) {
6094       case nir_tex_src_coord:
6095          coords = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[i].src.ssa));
6096          break;
6097       case nir_tex_src_bias:
6098          if (instr->op == nir_texop_txb) {
6099             bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
6100             has_bias = true;
6101          }
6102          break;
6103       case nir_tex_src_lod: {
6104          nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
6105
6106          if (val && val->f32 <= 0.0) {
6107             level_zero = true;
6108          } else {
6109             lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
6110             has_lod = true;
6111          }
6112          break;
6113       }
6114       case nir_tex_src_comparator:
6115          if (instr->is_shadow) {
6116             compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
6117             has_compare = true;
6118          }
6119          break;
6120       case nir_tex_src_offset:
6121          offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
6122          get_const_vec(instr->src[i].src.ssa, const_offset);
6123          has_offset = true;
6124          break;
6125       case nir_tex_src_ddx:
6126          ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
6127          has_ddx = true;
6128          break;
6129       case nir_tex_src_ddy:
6130          ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
6131          has_ddy = true;
6132          break;
6133       case nir_tex_src_ms_index:
6134          sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
6135          sample_index_cv = nir_src_as_const_value(instr->src[i].src);
6136          has_sample_index = true;
6137          break;
6138       case nir_tex_src_texture_offset:
6139       case nir_tex_src_sampler_offset:
6140       default:
6141          break;
6142       }
6143    }
6144 // TODO: all other cases: structure taken from ac_nir_to_llvm.c
6145    if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
6146       return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
6147
6148    if (instr->op == nir_texop_texture_samples) {
6149       Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
6150
6151       Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
6152       Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
6153       Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
6154       Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
6155
6156       bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
6157                samples, Operand(1u), bld.scc(is_msaa));
6158       return;
6159    }
6160
6161    if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
6162       aco_ptr<Instruction> tmp_instr;
6163       Temp acc, pack = Temp();
6164
6165       uint32_t pack_const = 0;
6166       for (unsigned i = 0; i < offset.size(); i++) {
6167          if (!const_offset[i])
6168             continue;
6169          pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
6170       }
6171
6172       if (offset.type() == RegType::sgpr) {
6173          for (unsigned i = 0; i < offset.size(); i++) {
6174             if (const_offset[i])
6175                continue;
6176
6177             acc = emit_extract_vector(ctx, offset, i, s1);
6178             acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
6179
6180             if (i) {
6181                acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
6182             }
6183
6184             if (pack == Temp()) {
6185                pack = acc;
6186             } else {
6187                pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
6188             }
6189          }
6190
6191          if (pack_const && pack != Temp())
6192             pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
6193       } else {
6194          for (unsigned i = 0; i < offset.size(); i++) {
6195             if (const_offset[i])
6196                continue;
6197
6198             acc = emit_extract_vector(ctx, offset, i, v1);
6199             acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
6200
6201             if (i) {
6202                acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
6203             }
6204
6205             if (pack == Temp()) {
6206                pack = acc;
6207             } else {
6208                pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
6209             }
6210          }
6211
6212          if (pack_const && pack != Temp())
6213             pack = bld.sop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
6214       }
6215       if (pack_const && pack == Temp())
6216          offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
6217       else if (pack == Temp())
6218          has_offset = false;
6219       else
6220          offset = pack;
6221    }
6222
6223    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
6224       prepare_cube_coords(ctx, &coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
6225
6226    /* pack derivatives */
6227    if (has_ddx || has_ddy) {
6228       if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
6229          derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4),
6230                              ddx, Operand(0u), ddy, Operand(0u));
6231       } else {
6232          derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, ddx.size() + ddy.size()), ddx, ddy);
6233       }
6234       has_derivs = true;
6235    }
6236
6237    if (instr->coord_components > 1 &&
6238        instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6239        instr->is_array &&
6240        instr->op != nir_texop_txf)
6241       coords = apply_round_slice(ctx, coords, 1);
6242
6243    if (instr->coord_components > 2 &&
6244       (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
6245        instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6246        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
6247        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6248        instr->is_array &&
6249        instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms)
6250       coords = apply_round_slice(ctx, coords, 2);
6251
6252    if (ctx->options->chip_class == GFX9 &&
6253        instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6254        instr->op != nir_texop_lod && instr->coord_components) {
6255       assert(coords.size() > 0 && coords.size() < 3);
6256
6257       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size() + 1, 1)};
6258       vec->operands[0] = Operand(emit_extract_vector(ctx, coords, 0, v1));
6259       vec->operands[1] = instr->op == nir_texop_txf ? Operand((uint32_t) 0) : Operand((uint32_t) 0x3f000000);
6260       if (coords.size() > 1)
6261          vec->operands[2] = Operand(emit_extract_vector(ctx, coords, 1, v1));
6262       coords = bld.tmp(RegType::vgpr, coords.size() + 1);
6263       vec->definitions[0] = Definition(coords);
6264       ctx->block->instructions.emplace_back(std::move(vec));
6265    }
6266
6267    bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
6268
6269    if (instr->op == nir_texop_samples_identical)
6270       resource = fmask_ptr;
6271
6272    else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6273              instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6274             instr->op != nir_texop_txs) {
6275       assert(has_sample_index);
6276       Operand op(sample_index);
6277       if (sample_index_cv)
6278          op = Operand(sample_index_cv->u32);
6279       sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
6280    }
6281
6282    if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
6283       Temp split_coords[coords.size()];
6284       emit_split_vector(ctx, coords, coords.size());
6285       for (unsigned i = 0; i < coords.size(); i++)
6286          split_coords[i] = emit_extract_vector(ctx, coords, i, v1);
6287
6288       unsigned i = 0;
6289       for (; i < std::min(offset.size(), instr->coord_components); i++) {
6290          Temp off = emit_extract_vector(ctx, offset, i, v1);
6291          split_coords[i] = bld.vadd32(bld.def(v1), split_coords[i], off);
6292       }
6293
6294       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6295       for (unsigned i = 0; i < coords.size(); i++)
6296          vec->operands[i] = Operand(split_coords[i]);
6297       coords = bld.tmp(coords.regClass());
6298       vec->definitions[0] = Definition(coords);
6299       ctx->block->instructions.emplace_back(std::move(vec));
6300
6301       has_offset = false;
6302    }
6303
6304    /* Build tex instruction */
6305    unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
6306    unsigned dim = ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF
6307                   ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array)
6308                   : 0;
6309    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6310    Temp tmp_dst = dst;
6311
6312    /* gather4 selects the component by dmask and always returns vec4 */
6313    if (instr->op == nir_texop_tg4) {
6314       assert(instr->dest.ssa.num_components == 4);
6315       if (instr->is_shadow)
6316          dmask = 1;
6317       else
6318          dmask = 1 << instr->component;
6319       if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
6320          tmp_dst = bld.tmp(v4);
6321    } else if (instr->op == nir_texop_samples_identical) {
6322       tmp_dst = bld.tmp(v1);
6323    } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) {
6324       tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
6325    }
6326
6327    aco_ptr<MIMG_instruction> tex;
6328    if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
6329       if (!has_lod)
6330          lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6331
6332       bool div_by_6 = instr->op == nir_texop_txs &&
6333                       instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
6334                       instr->is_array &&
6335                       (dmask & (1 << 2));
6336       if (tmp_dst.id() == dst.id() && div_by_6)
6337          tmp_dst = bld.tmp(tmp_dst.regClass());
6338
6339       tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6340       tex->operands[0] = Operand(as_vgpr(ctx,lod));
6341       tex->operands[1] = Operand(resource);
6342       if (ctx->options->chip_class == GFX9 &&
6343           instr->op == nir_texop_txs &&
6344           instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6345           instr->is_array) {
6346          tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
6347       } else if (instr->op == nir_texop_query_levels) {
6348          tex->dmask = 1 << 3;
6349       } else {
6350          tex->dmask = dmask;
6351       }
6352       tex->da = da;
6353       tex->definitions[0] = Definition(tmp_dst);
6354       tex->dim = dim;
6355       tex->can_reorder = true;
6356       ctx->block->instructions.emplace_back(std::move(tex));
6357
6358       if (div_by_6) {
6359          /* divide 3rd value by 6 by multiplying with magic number */
6360          emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6361          Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
6362          Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c);
6363          assert(instr->dest.ssa.num_components == 3);
6364          Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
6365          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6366                               emit_extract_vector(ctx, tmp_dst, 0, v1),
6367                               emit_extract_vector(ctx, tmp_dst, 1, v1),
6368                               by_6);
6369
6370       }
6371
6372       expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6373       return;
6374    }
6375
6376    Temp tg4_compare_cube_wa64 = Temp();
6377
6378    if (tg4_integer_workarounds) {
6379       tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6380       tex->operands[0] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6381       tex->operands[1] = Operand(resource);
6382       tex->dim = dim;
6383       tex->dmask = 0x3;
6384       tex->da = da;
6385       Temp size = bld.tmp(v2);
6386       tex->definitions[0] = Definition(size);
6387       tex->can_reorder = true;
6388       ctx->block->instructions.emplace_back(std::move(tex));
6389       emit_split_vector(ctx, size, size.size());
6390
6391       Temp half_texel[2];
6392       for (unsigned i = 0; i < 2; i++) {
6393          half_texel[i] = emit_extract_vector(ctx, size, i, v1);
6394          half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
6395          half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
6396          half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
6397       }
6398
6399       Temp orig_coords[2] = {
6400          emit_extract_vector(ctx, coords, 0, v1),
6401          emit_extract_vector(ctx, coords, 1, v1)};
6402       Temp new_coords[2] = {
6403          bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[0], half_texel[0]),
6404          bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[1], half_texel[1])
6405       };
6406
6407       if (tg4_integer_cube_workaround) {
6408          // see comment in ac_nir_to_llvm.c's lower_gather4_integer()
6409          Temp desc[resource.size()];
6410          aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector,
6411                                                                            Format::PSEUDO, 1, resource.size())};
6412          split->operands[0] = Operand(resource);
6413          for (unsigned i = 0; i < resource.size(); i++) {
6414             desc[i] = bld.tmp(s1);
6415             split->definitions[i] = Definition(desc[i]);
6416          }
6417          ctx->block->instructions.emplace_back(std::move(split));
6418
6419          Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16)));
6420          Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
6421                                          Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8));
6422
6423          Temp nfmt;
6424          if (stype == GLSL_TYPE_UINT) {
6425             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6426                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED),
6427                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT),
6428                             bld.scc(compare_cube_wa));
6429          } else {
6430             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6431                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED),
6432                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
6433                             bld.scc(compare_cube_wa));
6434          }
6435          tg4_compare_cube_wa64 = as_divergent_bool(ctx, compare_cube_wa, true);
6436          nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
6437
6438          desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
6439                             Operand((uint32_t)C_008F14_NUM_FORMAT));
6440          desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
6441
6442          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
6443                                                                          Format::PSEUDO, resource.size(), 1)};
6444          for (unsigned i = 0; i < resource.size(); i++)
6445             vec->operands[i] = Operand(desc[i]);
6446          resource = bld.tmp(resource.regClass());
6447          vec->definitions[0] = Definition(resource);
6448          ctx->block->instructions.emplace_back(std::move(vec));
6449
6450          new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6451                                   new_coords[0], orig_coords[0], tg4_compare_cube_wa64);
6452          new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6453                                   new_coords[1], orig_coords[1], tg4_compare_cube_wa64);
6454       }
6455
6456       if (coords.size() == 3) {
6457          coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3),
6458                              new_coords[0], new_coords[1],
6459                              emit_extract_vector(ctx, coords, 2, v1));
6460       } else {
6461          assert(coords.size() == 2);
6462          coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
6463                              new_coords[0], new_coords[1]);
6464       }
6465    }
6466
6467    if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
6468        instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
6469        instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
6470       coords = emit_wqm(ctx, coords, bld.tmp(coords.regClass()), true);
6471
6472    std::vector<Operand> args;
6473    if (has_offset)
6474       args.emplace_back(Operand(offset));
6475    if (has_bias)
6476       args.emplace_back(Operand(bias));
6477    if (has_compare)
6478       args.emplace_back(Operand(compare));
6479    if (has_derivs)
6480       args.emplace_back(Operand(derivs));
6481    args.emplace_back(Operand(coords));
6482    if (has_sample_index)
6483       args.emplace_back(Operand(sample_index));
6484    if (has_lod)
6485       args.emplace_back(lod);
6486
6487    Operand arg;
6488    if (args.size() > 1) {
6489       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
6490       unsigned size = 0;
6491       for (unsigned i = 0; i < args.size(); i++) {
6492          size += args[i].size();
6493          vec->operands[i] = args[i];
6494       }
6495       RegClass rc = RegClass(RegType::vgpr, size);
6496       Temp tmp = bld.tmp(rc);
6497       vec->definitions[0] = Definition(tmp);
6498       ctx->block->instructions.emplace_back(std::move(vec));
6499       arg = Operand(tmp);
6500    } else {
6501       assert(args[0].isTemp());
6502       arg = Operand(as_vgpr(ctx, args[0].getTemp()));
6503    }
6504
6505    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
6506       //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
6507
6508       assert(coords.size() == 1);
6509       unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
6510       aco_opcode op;
6511       switch (last_bit) {
6512       case 1:
6513          op = aco_opcode::buffer_load_format_x; break;
6514       case 2:
6515          op = aco_opcode::buffer_load_format_xy; break;
6516       case 3:
6517          op = aco_opcode::buffer_load_format_xyz; break;
6518       case 4:
6519          op = aco_opcode::buffer_load_format_xyzw; break;
6520       default:
6521          unreachable("Tex instruction loads more than 4 components.");
6522       }
6523
6524       /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */
6525       if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
6526          tmp_dst = dst;
6527       else
6528          tmp_dst = bld.tmp(RegType::vgpr, last_bit);
6529
6530       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
6531       mubuf->operands[0] = Operand(coords);
6532       mubuf->operands[1] = Operand(resource);
6533       mubuf->operands[2] = Operand((uint32_t) 0);
6534       mubuf->definitions[0] = Definition(tmp_dst);
6535       mubuf->idxen = true;
6536       mubuf->can_reorder = true;
6537       ctx->block->instructions.emplace_back(std::move(mubuf));
6538
6539       expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
6540       return;
6541    }
6542
6543
6544    if (instr->op == nir_texop_txf ||
6545        instr->op == nir_texop_txf_ms ||
6546        instr->op == nir_texop_samples_identical) {
6547       aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
6548       tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 2, 1));
6549       tex->operands[0] = Operand(arg);
6550       tex->operands[1] = Operand(resource);
6551       tex->dim = dim;
6552       tex->dmask = dmask;
6553       tex->unrm = true;
6554       tex->da = da;
6555       tex->definitions[0] = Definition(tmp_dst);
6556       tex->can_reorder = true;
6557       ctx->block->instructions.emplace_back(std::move(tex));
6558
6559       if (instr->op == nir_texop_samples_identical) {
6560          assert(dmask == 1 && dst.regClass() == v1);
6561          assert(dst.id() != tmp_dst.id());
6562
6563          Temp tmp = bld.tmp(s2);
6564          bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
6565          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
6566
6567       } else {
6568          expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6569       }
6570       return;
6571    }
6572
6573    // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
6574    aco_opcode opcode = aco_opcode::image_sample;
6575    if (has_offset) { /* image_sample_*_o */
6576       if (has_compare) {
6577          opcode = aco_opcode::image_sample_c_o;
6578          if (has_derivs)
6579             opcode = aco_opcode::image_sample_c_d_o;
6580          if (has_bias)
6581             opcode = aco_opcode::image_sample_c_b_o;
6582          if (level_zero)
6583             opcode = aco_opcode::image_sample_c_lz_o;
6584          if (has_lod)
6585             opcode = aco_opcode::image_sample_c_l_o;
6586       } else {
6587          opcode = aco_opcode::image_sample_o;
6588          if (has_derivs)
6589             opcode = aco_opcode::image_sample_d_o;
6590          if (has_bias)
6591             opcode = aco_opcode::image_sample_b_o;
6592          if (level_zero)
6593             opcode = aco_opcode::image_sample_lz_o;
6594          if (has_lod)
6595             opcode = aco_opcode::image_sample_l_o;
6596       }
6597    } else { /* no offset */
6598       if (has_compare) {
6599          opcode = aco_opcode::image_sample_c;
6600          if (has_derivs)
6601             opcode = aco_opcode::image_sample_c_d;
6602          if (has_bias)
6603             opcode = aco_opcode::image_sample_c_b;
6604          if (level_zero)
6605             opcode = aco_opcode::image_sample_c_lz;
6606          if (has_lod)
6607             opcode = aco_opcode::image_sample_c_l;
6608       } else {
6609          opcode = aco_opcode::image_sample;
6610          if (has_derivs)
6611             opcode = aco_opcode::image_sample_d;
6612          if (has_bias)
6613             opcode = aco_opcode::image_sample_b;
6614          if (level_zero)
6615             opcode = aco_opcode::image_sample_lz;
6616          if (has_lod)
6617             opcode = aco_opcode::image_sample_l;
6618       }
6619    }
6620
6621    if (instr->op == nir_texop_tg4) {
6622       if (has_offset) {
6623          opcode = aco_opcode::image_gather4_lz_o;
6624          if (has_compare)
6625             opcode = aco_opcode::image_gather4_c_lz_o;
6626       } else {
6627          opcode = aco_opcode::image_gather4_lz;
6628          if (has_compare)
6629             opcode = aco_opcode::image_gather4_c_lz;
6630       }
6631    } else if (instr->op == nir_texop_lod) {
6632       opcode = aco_opcode::image_get_lod;
6633    }
6634
6635    tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
6636    tex->operands[0] = arg;
6637    tex->operands[1] = Operand(resource);
6638    tex->operands[2] = Operand(sampler);
6639    tex->dim = dim;
6640    tex->dmask = dmask;
6641    tex->da = da;
6642    tex->definitions[0] = Definition(tmp_dst);
6643    tex->can_reorder = true;
6644    ctx->block->instructions.emplace_back(std::move(tex));
6645
6646    if (tg4_integer_cube_workaround) {
6647       assert(tmp_dst.id() != dst.id());
6648       assert(tmp_dst.size() == dst.size() && dst.size() == 4);
6649
6650       emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6651       Temp val[4];
6652       for (unsigned i = 0; i < dst.size(); i++) {
6653          val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
6654          Temp cvt_val;
6655          if (stype == GLSL_TYPE_UINT)
6656             cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
6657          else
6658             cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
6659          val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64);
6660       }
6661       Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4);
6662       tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6663                            val[0], val[1], val[2], val[3]);
6664    }
6665    unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask;
6666    expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
6667
6668 }
6669
6670
6671 Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa)
6672 {
6673    Temp tmp = get_ssa_temp(ctx, ssa);
6674    if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
6675       return Operand(tmp.regClass());
6676    else
6677       return Operand(tmp);
6678 }
6679
6680 void visit_phi(isel_context *ctx, nir_phi_instr *instr)
6681 {
6682    aco_ptr<Pseudo_instruction> phi;
6683    unsigned num_src = exec_list_length(&instr->srcs);
6684    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6685
6686    aco_opcode opcode = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index] ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
6687
6688    std::map<unsigned, nir_ssa_def*> phi_src;
6689    bool all_undef = true;
6690    nir_foreach_phi_src(src, instr) {
6691       phi_src[src->pred->index] = src->src.ssa;
6692       if (src->src.ssa->parent_instr->type != nir_instr_type_ssa_undef)
6693          all_undef = false;
6694    }
6695    if (all_undef) {
6696       Builder bld(ctx->program, ctx->block);
6697       if (dst.regClass() == s1) {
6698          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
6699       } else if (dst.regClass() == v1) {
6700          bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
6701       } else {
6702          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6703          for (unsigned i = 0; i < dst.size(); i++)
6704             vec->operands[i] = Operand(0u);
6705          vec->definitions[0] = Definition(dst);
6706          ctx->block->instructions.emplace_back(std::move(vec));
6707       }
6708       return;
6709    }
6710
6711    /* try to scalarize vector phis */
6712    if (dst.size() > 1) {
6713       // TODO: scalarize linear phis on divergent ifs
6714       bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
6715       std::array<Temp, 4> new_vec;
6716       for (std::pair<const unsigned, nir_ssa_def*>& pair : phi_src) {
6717          Operand src = get_phi_operand(ctx, pair.second);
6718          if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end()) {
6719             can_scalarize = false;
6720             break;
6721          }
6722       }
6723       if (can_scalarize) {
6724          unsigned num_components = instr->dest.ssa.num_components;
6725          assert(dst.size() % num_components == 0);
6726          RegClass rc = RegClass(dst.type(), dst.size() / num_components);
6727
6728          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
6729          for (unsigned k = 0; k < num_components; k++) {
6730             phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src, 1));
6731             std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin();
6732             for (unsigned i = 0; i < num_src; i++) {
6733                Operand src = get_phi_operand(ctx, it->second);
6734                phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc);
6735                ++it;
6736             }
6737             Temp phi_dst = {ctx->program->allocateId(), rc};
6738             phi->definitions[0] = Definition(phi_dst);
6739             ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
6740             new_vec[k] = phi_dst;
6741             vec->operands[k] = Operand(phi_dst);
6742          }
6743          vec->definitions[0] = Definition(dst);
6744          ctx->block->instructions.emplace_back(std::move(vec));
6745          ctx->allocated_vec.emplace(dst.id(), new_vec);
6746          return;
6747       }
6748    }
6749
6750    unsigned extra_src = 0;
6751    if (opcode == aco_opcode::p_linear_phi && (ctx->block->kind & block_kind_loop_exit) &&
6752        ctx->program->blocks[ctx->block->index-2].kind & block_kind_continue_or_break) {
6753       extra_src++;
6754    }
6755
6756    phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src + extra_src, 1));
6757
6758    /* if we have a linear phi on a divergent if, we know that one src is undef */
6759    if (opcode == aco_opcode::p_linear_phi && ctx->block->kind & block_kind_merge) {
6760       assert(extra_src == 0);
6761       Block* block;
6762       /* we place the phi either in the invert-block or in the current block */
6763       if (phi_src.begin()->second->parent_instr->type != nir_instr_type_ssa_undef) {
6764          assert((++phi_src.begin())->second->parent_instr->type == nir_instr_type_ssa_undef);
6765          Block& linear_else = ctx->program->blocks[ctx->block->linear_preds[1]];
6766          block = &ctx->program->blocks[linear_else.linear_preds[0]];
6767          assert(block->kind & block_kind_invert);
6768          phi->operands[0] = get_phi_operand(ctx, phi_src.begin()->second);
6769       } else {
6770          assert((++phi_src.begin())->second->parent_instr->type != nir_instr_type_ssa_undef);
6771          block = ctx->block;
6772          phi->operands[0] = get_phi_operand(ctx, (++phi_src.begin())->second);
6773       }
6774       phi->operands[1] = Operand(dst.regClass());
6775       phi->definitions[0] = Definition(dst);
6776       block->instructions.emplace(block->instructions.begin(), std::move(phi));
6777       return;
6778    }
6779
6780    std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin();
6781    for (unsigned i = 0; i < num_src; i++) {
6782       phi->operands[i] = get_phi_operand(ctx, it->second);
6783       ++it;
6784    }
6785    for (unsigned i = 0; i < extra_src; i++)
6786       phi->operands[num_src + i] = Operand(dst.regClass());
6787    phi->definitions[0] = Definition(dst);
6788    ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
6789 }
6790
6791
6792 void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr)
6793 {
6794    Temp dst = get_ssa_temp(ctx, &instr->def);
6795
6796    assert(dst.type() == RegType::sgpr);
6797
6798    if (dst.size() == 1) {
6799       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u));
6800    } else {
6801       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6802       for (unsigned i = 0; i < dst.size(); i++)
6803          vec->operands[i] = Operand(0u);
6804       vec->definitions[0] = Definition(dst);
6805       ctx->block->instructions.emplace_back(std::move(vec));
6806    }
6807 }
6808
6809 void visit_jump(isel_context *ctx, nir_jump_instr *instr)
6810 {
6811    Builder bld(ctx->program, ctx->block);
6812    Block *logical_target;
6813    append_logical_end(ctx->block);
6814    unsigned idx = ctx->block->index;
6815
6816    switch (instr->type) {
6817    case nir_jump_break:
6818       logical_target = ctx->cf_info.parent_loop.exit;
6819       add_logical_edge(idx, logical_target);
6820       ctx->block->kind |= block_kind_break;
6821
6822       if (!ctx->cf_info.parent_if.is_divergent &&
6823           !ctx->cf_info.parent_loop.has_divergent_continue) {
6824          /* uniform break - directly jump out of the loop */
6825          ctx->block->kind |= block_kind_uniform;
6826          ctx->cf_info.has_branch = true;
6827          bld.branch(aco_opcode::p_branch);
6828          add_linear_edge(idx, logical_target);
6829          return;
6830       }
6831       ctx->cf_info.parent_loop.has_divergent_branch = true;
6832       break;
6833    case nir_jump_continue:
6834       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
6835       add_logical_edge(idx, logical_target);
6836       ctx->block->kind |= block_kind_continue;
6837
6838       if (ctx->cf_info.parent_if.is_divergent) {
6839          /* for potential uniform breaks after this continue,
6840             we must ensure that they are handled correctly */
6841          ctx->cf_info.parent_loop.has_divergent_continue = true;
6842          ctx->cf_info.parent_loop.has_divergent_branch = true;
6843       } else {
6844          /* uniform continue - directly jump to the loop header */
6845          ctx->block->kind |= block_kind_uniform;
6846          ctx->cf_info.has_branch = true;
6847          bld.branch(aco_opcode::p_branch);
6848          add_linear_edge(idx, logical_target);
6849          return;
6850       }
6851       break;
6852    default:
6853       fprintf(stderr, "Unknown NIR jump instr: ");
6854       nir_print_instr(&instr->instr, stderr);
6855       fprintf(stderr, "\n");
6856       abort();
6857    }
6858
6859    /* remove critical edges from linear CFG */
6860    bld.branch(aco_opcode::p_branch);
6861    Block* break_block = ctx->program->create_and_insert_block();
6862    break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
6863    break_block->kind |= block_kind_uniform;
6864    add_linear_edge(idx, break_block);
6865    /* the loop_header pointer might be invalidated by this point */
6866    if (instr->type == nir_jump_continue)
6867       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
6868    add_linear_edge(break_block->index, logical_target);
6869    bld.reset(break_block);
6870    bld.branch(aco_opcode::p_branch);
6871
6872    Block* continue_block = ctx->program->create_and_insert_block();
6873    continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
6874    add_linear_edge(idx, continue_block);
6875    append_logical_start(continue_block);
6876    ctx->block = continue_block;
6877    return;
6878 }
6879
6880 void visit_block(isel_context *ctx, nir_block *block)
6881 {
6882    nir_foreach_instr(instr, block) {
6883       switch (instr->type) {
6884       case nir_instr_type_alu:
6885          visit_alu_instr(ctx, nir_instr_as_alu(instr));
6886          break;
6887       case nir_instr_type_load_const:
6888          visit_load_const(ctx, nir_instr_as_load_const(instr));
6889          break;
6890       case nir_instr_type_intrinsic:
6891          visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
6892          break;
6893       case nir_instr_type_tex:
6894          visit_tex(ctx, nir_instr_as_tex(instr));
6895          break;
6896       case nir_instr_type_phi:
6897          visit_phi(ctx, nir_instr_as_phi(instr));
6898          break;
6899       case nir_instr_type_ssa_undef:
6900          visit_undef(ctx, nir_instr_as_ssa_undef(instr));
6901          break;
6902       case nir_instr_type_deref:
6903          break;
6904       case nir_instr_type_jump:
6905          visit_jump(ctx, nir_instr_as_jump(instr));
6906          break;
6907       default:
6908          fprintf(stderr, "Unknown NIR instr type: ");
6909          nir_print_instr(instr, stderr);
6910          fprintf(stderr, "\n");
6911          //abort();
6912       }
6913    }
6914 }
6915
6916
6917
/* Selects instructions for a NIR loop.
 *
 * Steps, in order:
 *  1. Close the current block as the (uniform) loop preheader.
 *  2. Prepare the loop exit block as a local object; it is only inserted into
 *     the program after the body has been emitted.
 *  3. Create the loop header block, link it to the preheader and emit the
 *     loop body into it.
 *  4. If the body did not end in a branch, emit the back edge to the header
 *     (optionally with an extra block leading to the loop exit, see below).
 *  5. Drop the last operand of the header phis when the back edge is missing
 *     from the corresponding CFG.
 *  6. Insert the loop exit block and continue selection there. */
static void visit_loop(isel_context *ctx, nir_loop *loop)
{
   append_logical_end(ctx->block);
   ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
   Builder bld(ctx->program, ctx->block);
   bld.branch(aco_opcode::p_branch);
   unsigned loop_preheader_idx = ctx->block->index;

   /* The exit block lives on the stack until it is moved into the program
    * at the end of this function; edges added to it meanwhile go through
    * its address. */
   Block loop_exit = Block();
   loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth;
   loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));

   Block* loop_header = ctx->program->create_and_insert_block();
   loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
   loop_header->kind |= block_kind_loop_header;
   add_edge(loop_preheader_idx, loop_header);
   ctx->block = loop_header;

   /* emit loop body */
   /* The header is referred to by index from here on rather than through
    * the loop_header pointer. */
   unsigned loop_header_idx = loop_header->index;
   /* NOTE(review): loop_info_RAII presumably installs the header index and
    * exit block as the current loop info and restores the previous values
    * when it goes out of scope - confirm against its definition. */
   loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
   append_logical_start(ctx->block);
   visit_cf_list(ctx, &loop->body);

   //TODO: what if a loop ends with an unconditional or uniformly branched continue and this branch is never taken?
   if (!ctx->cf_info.has_branch) {
      append_logical_end(ctx->block);
      if (ctx->cf_info.exec_potentially_empty) {
         /* Discards can result in code running with an empty exec mask.
          * This would result in divergent breaks not ever being taken. As a
          * workaround, break the loop when the loop mask is empty instead of
          * always continuing. */
         ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);

         /* create "loop_almost_exit" to avoid critical edges */
         unsigned block_idx = ctx->block->index;
         Block *loop_almost_exit = ctx->program->create_and_insert_block();
         loop_almost_exit->loop_nest_depth = ctx->cf_info.loop_nest_depth;
         loop_almost_exit->kind = block_kind_uniform;
         bld.reset(loop_almost_exit);
         bld.branch(aco_opcode::p_branch);

         add_linear_edge(block_idx, loop_almost_exit);
         add_linear_edge(loop_almost_exit->index, &loop_exit);

         /* re-fetch the last body block by index after the insertion above */
         ctx->block = &ctx->program->blocks[block_idx];
      } else {
         ctx->block->kind |= (block_kind_continue | block_kind_uniform);
      }
      /* back edge to the loop header: linear only if a divergent branch was
       * recorded for this loop, logical + linear otherwise */
      if (!ctx->cf_info.parent_loop.has_divergent_branch)
         add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
      else
         add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
      bld.reset(ctx->block);
      bld.branch(aco_opcode::p_branch);
   }

   /* fixup phis in loop header from unreachable blocks */
   if (ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch) {
      /* "logical" repeats the enclosing condition, so it is always true
       * here; "linear" is only true when the body ended in a branch. */
      bool linear = ctx->cf_info.has_branch;
      bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
      for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
         if ((logical && instr->opcode == aco_opcode::p_phi) ||
             (linear && instr->opcode == aco_opcode::p_linear_phi)) {
            /* the last operand should be the one that needs to be removed */
            instr->operands.pop_back();
         } else if (!is_phi(instr)) {
            /* stop at the first non-phi instruction */
            break;
         }
      }
   }

   ctx->cf_info.has_branch = false;

   // TODO: if the loop does not have a single exit, we must add one
   /* emit loop successor block */
   ctx->block = ctx->program->insert_block(std::move(loop_exit));
   append_logical_start(ctx->block);

   /* Disabled code below: it refers to "loop_entry", which is not declared
    * in this function, so it would not compile if enabled as is. */
   #if 0
   // TODO: check if it is beneficial to not branch on continues
   /* trim linear phis in loop header */
   for (auto&& instr : loop_entry->instructions) {
      if (instr->opcode == aco_opcode::p_linear_phi) {
         aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
         new_phi->definitions[0] = instr->definitions[0];
         for (unsigned i = 0; i < new_phi->operands.size(); i++)
            new_phi->operands[i] = instr->operands[i];
         /* check that the remaining operands are all the same */
         for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
            assert(instr->operands[i].tempId() == instr->operands.back().tempId());
         instr.swap(new_phi);
      } else if (instr->opcode == aco_opcode::p_phi) {
         continue;
      } else {
         break;
      }
   }
   #endif
}
7018
7019 static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond)
7020 {
7021    ic->cond = cond;
7022
7023    append_logical_end(ctx->block);
7024    ctx->block->kind |= block_kind_branch;
7025
7026    /* branch to linear then block */
7027    assert(cond.regClass() == s2);
7028    aco_ptr<Pseudo_branch_instruction> branch;
7029    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7030    branch->operands[0] = Operand(cond);
7031    ctx->block->instructions.push_back(std::move(branch));
7032
7033    ic->BB_if_idx = ctx->block->index;
7034    ic->BB_invert = Block();
7035    ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7036    /* Invert blocks are intentionally not marked as top level because they
7037     * are not part of the logical cfg. */
7038    ic->BB_invert.kind |= block_kind_invert;
7039    ic->BB_endif = Block();
7040    ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7041    ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
7042
7043    ic->exec_potentially_empty_old = ctx->cf_info.exec_potentially_empty;
7044    ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
7045    ctx->cf_info.parent_if.is_divergent = true;
7046    ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7047
7048    /** emit logical then block */
7049    Block* BB_then_logical = ctx->program->create_and_insert_block();
7050    BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7051    add_edge(ic->BB_if_idx, BB_then_logical);
7052    ctx->block = BB_then_logical;
7053    append_logical_start(BB_then_logical);
7054 }
7055
/* Closes the then-side of a divergent conditional and opens the else-side.
 *
 * Expects ctx->block to be the last block of the logical then side and *ic
 * to have been set up by begin_divergent_if_then(). Emits, in order: the
 * branch out of the logical then block, the linear then block, the invert
 * block (ending in a p_cbranch_nz on ic->cond), and the logical else block,
 * which becomes ctx->block. */
static void begin_divergent_if_else(isel_context *ctx, if_context *ic)
{
   Block *BB_then_logical = ctx->block;
   append_logical_end(BB_then_logical);
   /* branch from logical then block to invert block */
   aco_ptr<Pseudo_branch_instruction> branch;
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
   BB_then_logical->instructions.emplace_back(std::move(branch));
   add_linear_edge(BB_then_logical->index, &ic->BB_invert);
   /* no logical edge to the endif block if the then side recorded a
    * divergent branch */
   if (!ctx->cf_info.parent_loop.has_divergent_branch)
      add_logical_edge(BB_then_logical->index, &ic->BB_endif);
   BB_then_logical->kind |= block_kind_uniform;
   assert(!ctx->cf_info.has_branch);
   /* remember the then side's flag and start the else side with a clean one;
    * end_divergent_if() combines both */
   ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
   ctx->cf_info.parent_loop.has_divergent_branch = false;

   /** emit linear then block */
   Block* BB_then_linear = ctx->program->create_and_insert_block();
   BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
   BB_then_linear->kind |= block_kind_uniform;
   add_linear_edge(ic->BB_if_idx, BB_then_linear);
   /* branch from linear then block to invert block */
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
   BB_then_linear->instructions.emplace_back(std::move(branch));
   add_linear_edge(BB_then_linear->index, &ic->BB_invert);

   /** emit invert merge block */
   ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
   ic->invert_idx = ctx->block->index;

   /* branch to linear else block (skip else) */
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0));
   branch->operands[0] = Operand(ic->cond);
   ctx->block->instructions.push_back(std::move(branch));

   /* accumulate the then side's state into the saved value, then reset it
    * for the else side */
   ic->exec_potentially_empty_old |= ctx->cf_info.exec_potentially_empty;
   ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */

   /** emit logical else block */
   Block* BB_else_logical = ctx->program->create_and_insert_block();
   BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
   /* logical predecessor is the if block, linear predecessor the invert block */
   add_logical_edge(ic->BB_if_idx, BB_else_logical);
   add_linear_edge(ic->invert_idx, BB_else_logical);
   ctx->block = BB_else_logical;
   append_logical_start(BB_else_logical);
}
7102
/* Closes the else-side of a divergent conditional and emits the merge block.
 *
 * Expects ctx->block to be the last block of the logical else side and *ic
 * to have gone through begin_divergent_if_then() and
 * begin_divergent_if_else(). Emits the branch out of the logical else block,
 * the linear else block and the endif block (which becomes ctx->block), then
 * restores/merges the control-flow state saved in *ic. */
static void end_divergent_if(isel_context *ctx, if_context *ic)
{
   Block *BB_else_logical = ctx->block;
   append_logical_end(BB_else_logical);

   /* branch from logical else block to endif block */
   aco_ptr<Pseudo_branch_instruction> branch;
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
   BB_else_logical->instructions.emplace_back(std::move(branch));
   add_linear_edge(BB_else_logical->index, &ic->BB_endif);
   /* no logical edge to the endif block if the else side recorded a
    * divergent branch */
   if (!ctx->cf_info.parent_loop.has_divergent_branch)
      add_logical_edge(BB_else_logical->index, &ic->BB_endif);
   BB_else_logical->kind |= block_kind_uniform;

   assert(!ctx->cf_info.has_branch);
   /* the flag stays set only if both the then and the else side set it */
   ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;


   /** emit linear else block */
   Block* BB_else_linear = ctx->program->create_and_insert_block();
   BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
   BB_else_linear->kind |= block_kind_uniform;
   add_linear_edge(ic->invert_idx, BB_else_linear);

   /* branch from linear else block to endif block */
   branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
   BB_else_linear->instructions.emplace_back(std::move(branch));
   add_linear_edge(BB_else_linear->index, &ic->BB_endif);


   /** emit endif merge block */
   ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
   append_logical_start(ctx->block);


   /* restore the enclosing if's divergence and merge in the saved
    * exec_potentially_empty state */
   ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
   ctx->cf_info.exec_potentially_empty |= ic->exec_potentially_empty_old;
   /* uniform control flow never has an empty exec-mask */
   if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
      ctx->cf_info.exec_potentially_empty = false;
}
7144
7145 static void visit_if(isel_context *ctx, nir_if *if_stmt)
7146 {
7147    Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
7148    Builder bld(ctx->program, ctx->block);
7149    aco_ptr<Pseudo_branch_instruction> branch;
7150
7151    if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
7152       /**
7153        * Uniform conditionals are represented in the following way*) :
7154        *
7155        * The linear and logical CFG:
7156        *                        BB_IF
7157        *                        /    \
7158        *       BB_THEN (logical)      BB_ELSE (logical)
7159        *                        \    /
7160        *                        BB_ENDIF
7161        *
7162        * *) Exceptions may be due to break and continue statements within loops
7163        *    If a break/continue happens within uniform control flow, it branches
7164        *    to the loop exit/entry block. Otherwise, it branches to the next
7165        *    merge block.
7166        **/
7167       append_logical_end(ctx->block);
7168       ctx->block->kind |= block_kind_uniform;
7169
7170       /* emit branch */
7171       if (cond.regClass() == s2) {
7172          // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
7173          cond = as_uniform_bool(ctx, cond);
7174       }
7175       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7176       branch->operands[0] = Operand(cond);
7177       branch->operands[0].setFixed(scc);
7178       ctx->block->instructions.emplace_back(std::move(branch));
7179
7180       unsigned BB_if_idx = ctx->block->index;
7181       Block BB_endif = Block();
7182       BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7183       BB_endif.kind |= ctx->block->kind & block_kind_top_level;
7184
7185       /** emit then block */
7186       Block* BB_then = ctx->program->create_and_insert_block();
7187       BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7188       add_edge(BB_if_idx, BB_then);
7189       append_logical_start(BB_then);
7190       ctx->block = BB_then;
7191       visit_cf_list(ctx, &if_stmt->then_list);
7192       BB_then = ctx->block;
7193       bool then_branch = ctx->cf_info.has_branch;
7194       bool then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7195
7196       if (!then_branch) {
7197          append_logical_end(BB_then);
7198          /* branch from then block to endif block */
7199          branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7200          BB_then->instructions.emplace_back(std::move(branch));
7201          add_linear_edge(BB_then->index, &BB_endif);
7202          if (!then_branch_divergent)
7203             add_logical_edge(BB_then->index, &BB_endif);
7204          BB_then->kind |= block_kind_uniform;
7205       }
7206
7207       ctx->cf_info.has_branch = false;
7208       ctx->cf_info.parent_loop.has_divergent_branch = false;
7209
7210       /** emit else block */
7211       Block* BB_else = ctx->program->create_and_insert_block();
7212       BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7213       add_edge(BB_if_idx, BB_else);
7214       append_logical_start(BB_else);
7215       ctx->block = BB_else;
7216       visit_cf_list(ctx, &if_stmt->else_list);
7217       BB_else = ctx->block;
7218
7219       if (!ctx->cf_info.has_branch) {
7220          append_logical_end(BB_else);
7221          /* branch from then block to endif block */
7222          branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7223          BB_else->instructions.emplace_back(std::move(branch));
7224          add_linear_edge(BB_else->index, &BB_endif);
7225          if (!ctx->cf_info.parent_loop.has_divergent_branch)
7226             add_logical_edge(BB_else->index, &BB_endif);
7227          BB_else->kind |= block_kind_uniform;
7228       }
7229
7230       ctx->cf_info.has_branch &= then_branch;
7231       ctx->cf_info.parent_loop.has_divergent_branch &= then_branch_divergent;
7232
7233       /** emit endif merge block */
7234       if (!ctx->cf_info.has_branch) {
7235          ctx->block = ctx->program->insert_block(std::move(BB_endif));
7236          append_logical_start(ctx->block);
7237       }
7238    } else { /* non-uniform condition */
7239       /**
7240        * To maintain a logical and linear CFG without critical edges,
7241        * non-uniform conditionals are represented in the following way*) :
7242        *
7243        * The linear CFG:
7244        *                        BB_IF
7245        *                        /    \
7246        *       BB_THEN (logical)      BB_THEN (linear)
7247        *                        \    /
7248        *                        BB_INVERT (linear)
7249        *                        /    \
7250        *       BB_ELSE (logical)      BB_ELSE (linear)
7251        *                        \    /
7252        *                        BB_ENDIF
7253        *
7254        * The logical CFG:
7255        *                        BB_IF
7256        *                        /    \
7257        *       BB_THEN (logical)      BB_ELSE (logical)
7258        *                        \    /
7259        *                        BB_ENDIF
7260        *
7261        * *) Exceptions may be due to break and continue statements within loops
7262        **/
7263
7264       if_context ic;
7265
7266       begin_divergent_if_then(ctx, &ic, cond);
7267       visit_cf_list(ctx, &if_stmt->then_list);
7268
7269       begin_divergent_if_else(ctx, &ic);
7270       visit_cf_list(ctx, &if_stmt->else_list);
7271
7272       end_divergent_if(ctx, &ic);
7273    }
7274 }
7275
7276 static void visit_cf_list(isel_context *ctx,
7277                           struct exec_list *list)
7278 {
7279    foreach_list_typed(nir_cf_node, node, node, list) {
7280       switch (node->type) {
7281       case nir_cf_node_block:
7282          visit_block(ctx, nir_cf_node_as_block(node));
7283          break;
7284       case nir_cf_node_if:
7285          visit_if(ctx, nir_cf_node_as_if(node));
7286          break;
7287       case nir_cf_node_loop:
7288          visit_loop(ctx, nir_cf_node_as_loop(node));
7289          break;
7290       default:
7291          unreachable("unimplemented cf list type");
7292       }
7293    }
7294 }
7295
7296 static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
7297 {
7298    int offset = ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
7299    uint64_t mask = ctx->vs_output.mask[slot];
7300    if (!is_pos && !mask)
7301       return;
7302    if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
7303       return;
7304    aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7305    exp->enabled_mask = mask;
7306    for (unsigned i = 0; i < 4; ++i) {
7307       if (mask & (1 << i))
7308          exp->operands[i] = Operand(ctx->vs_output.outputs[slot][i]);
7309       else
7310          exp->operands[i] = Operand(v1);
7311    }
7312    exp->valid_mask = false;
7313    exp->done = false;
7314    exp->compressed = false;
7315    if (is_pos)
7316       exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7317    else
7318       exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
7319    ctx->block->instructions.emplace_back(std::move(exp));
7320 }
7321
7322 static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
7323 {
7324    aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7325    exp->enabled_mask = 0;
7326    for (unsigned i = 0; i < 4; ++i)
7327       exp->operands[i] = Operand(v1);
7328    if (ctx->vs_output.mask[VARYING_SLOT_PSIZ]) {
7329       exp->operands[0] = Operand(ctx->vs_output.outputs[VARYING_SLOT_PSIZ][0]);
7330       exp->enabled_mask |= 0x1;
7331    }
7332    if (ctx->vs_output.mask[VARYING_SLOT_LAYER]) {
7333       exp->operands[2] = Operand(ctx->vs_output.outputs[VARYING_SLOT_LAYER][0]);
7334       exp->enabled_mask |= 0x4;
7335    }
7336    if (ctx->vs_output.mask[VARYING_SLOT_VIEWPORT]) {
7337       if (ctx->options->chip_class < GFX9) {
7338          exp->operands[3] = Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]);
7339          exp->enabled_mask |= 0x8;
7340       } else {
7341          Builder bld(ctx->program, ctx->block);
7342
7343          Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
7344                              Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]));
7345          if (exp->operands[2].isTemp())
7346             out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
7347
7348          exp->operands[2] = Operand(out);
7349          exp->enabled_mask |= 0x4;
7350       }
7351    }
7352    exp->valid_mask = false;
7353    exp->done = false;
7354    exp->compressed = false;
7355    exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7356    ctx->block->instructions.emplace_back(std::move(exp));
7357 }
7358
7359 static void create_vs_exports(isel_context *ctx)
7360 {
7361    radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
7362
7363    if (outinfo->export_prim_id) {
7364       ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
7365       ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = ctx->vs_prim_id;
7366    }
7367
7368    if (ctx->options->key.has_multiview_view_index) {
7369       ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1;
7370       ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, ctx->view_index);
7371    }
7372
7373    /* the order these position exports are created is important */
7374    int next_pos = 0;
7375    export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
7376    if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
7377       export_vs_psiz_layer_viewport(ctx, &next_pos);
7378    }
7379    if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7380       export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
7381    if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7382       export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
7383
7384    if (ctx->options->key.vs_common_out.export_clip_dists) {
7385       if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7386          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
7387       if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7388          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
7389    }
7390
7391    for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
7392       if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER &&
7393           i != VARYING_SLOT_PRIMITIVE_ID)
7394          continue;
7395
7396       export_vs_varying(ctx, i, false, NULL);
7397    }
7398 }
7399
7400 static void emit_stream_output(isel_context *ctx,
7401                                Temp const *so_buffers,
7402                                Temp const *so_write_offset,
7403                                const struct radv_stream_output *output)
7404 {
7405    unsigned num_comps = util_bitcount(output->component_mask);
7406    unsigned loc = output->location;
7407    unsigned buf = output->buffer;
7408    unsigned offset = output->offset;
7409
7410    assert(num_comps && num_comps <= 4);
7411    if (!num_comps || num_comps > 4)
7412       return;
7413
7414    unsigned start = ffs(output->component_mask) - 1;
7415
7416    Temp out[4];
7417    bool all_undef = true;
7418    assert(ctx->stage == vertex_vs);
7419    for (unsigned i = 0; i < num_comps; i++) {
7420       out[i] = ctx->vs_output.outputs[loc][start + i];
7421       all_undef = all_undef && !out[i].id();
7422    }
7423    if (all_undef)
7424       return;
7425
7426    Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_comps)};
7427    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_comps, 1)};
7428    for (unsigned i = 0; i < num_comps; ++i)
7429       vec->operands[i] = (ctx->vs_output.mask[loc] & 1 << i) ? Operand(out[i]) : Operand(0u);
7430    vec->definitions[0] = Definition(write_data);
7431    ctx->block->instructions.emplace_back(std::move(vec));
7432
7433    aco_opcode opcode;
7434    switch (num_comps) {
7435    case 1:
7436       opcode = aco_opcode::buffer_store_dword;
7437       break;
7438    case 2:
7439       opcode = aco_opcode::buffer_store_dwordx2;
7440       break;
7441    case 3:
7442       opcode = aco_opcode::buffer_store_dwordx3;
7443       break;
7444    case 4:
7445       opcode = aco_opcode::buffer_store_dwordx4;
7446       break;
7447    }
7448
7449    aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
7450    store->operands[0] = Operand(so_write_offset[buf]);
7451    store->operands[1] = Operand(so_buffers[buf]);
7452    store->operands[2] = Operand((uint32_t) 0);
7453    store->operands[3] = Operand(write_data);
7454    if (offset > 4095) {
7455       /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */
7456       Builder bld(ctx->program, ctx->block);
7457       store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
7458    } else {
7459       store->offset = offset;
7460    }
7461    store->offen = true;
7462    store->glc = true;
7463    store->dlc = false;
7464    store->slc = true;
7465    store->can_reorder = true;
7466    ctx->block->instructions.emplace_back(std::move(store));
7467 }
7468
/* Emits the transform feedback stores for one vertex stream.
 *
 * Loads the descriptor of every buffer with a non-zero stride, determines
 * which lanes may emit, and inside a divergent if computes the per-buffer
 * write offsets and emits one store per output belonging to `stream`.
 * The else side of the if is left empty. */
static void emit_streamout(isel_context *ctx, unsigned stream)
{
   Builder bld(ctx->program, ctx->block);

   /* entries of buffers with a zero stride are left default-constructed */
   Temp so_buffers[4];
   Temp buf_ptr = convert_pointer_to_64_bit(ctx, ctx->streamout_buffers);
   for (unsigned i = 0; i < 4; i++) {
      unsigned stride = ctx->program->info->so.strides[i];
      if (!stride)
         continue;

      /* descriptors are read 16 bytes apart */
      so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, Operand(i * 16u));
   }

   /* NOTE(review): 0x70010 presumably selects a 7-bit field at bit 16 of
    * streamout_config - confirm against the s_bfe_u32 operand encoding. */
   Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
                                ctx->streamout_config, Operand(0x70010u));

   Temp tid = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
                       bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));

   /* only lanes with so_vtx_count > tid write their outputs */
   Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(s2), so_vtx_count, tid);

   if_context ic;
   begin_divergent_if_then(ctx, &ic, can_emit);

   /* ctx->block changed above; point the builder at the new block */
   bld.reset(ctx->block);

   Temp so_write_index = bld.vadd32(bld.def(v1), ctx->streamout_write_idx, tid);

   Temp so_write_offset[4];

   for (unsigned i = 0; i < 4; i++) {
      unsigned stride = ctx->program->info->so.strides[i];
      if (!stride)
         continue;

      if (stride == 1) {
         /* (write_idx + buffer offset + tid) << 2; the scalar part of the
          * sum is computed with a scalar add */
         Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
                                ctx->streamout_write_idx, ctx->streamout_offset[i]);
         Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);

         so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
      } else {
         /* so_write_index * (stride * 4) + buffer offset * 4 */
         Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
         Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u), ctx->streamout_offset[i]);
         so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
      }
   }

   for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
      struct radv_stream_output *output =
         &ctx->program->info->so.outputs[i];
      /* outputs of other streams are not handled by this call */
      if (stream != output->stream)
         continue;

      emit_stream_output(ctx, so_buffers, so_write_offset, output);
   }

   /* close the if with an empty else side */
   begin_divergent_if_else(ctx, &ic);
   end_divergent_if(ctx, &ic);
}
7530
7531 } /* end namespace */
7532
7533 void handle_bc_optimize(isel_context *ctx)
7534 {
7535    /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
7536    Builder bld(ctx->program, ctx->block);
7537    uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
7538    bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
7539    bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
7540    if (uses_center && uses_centroid) {
7541       Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)), ctx->prim_mask, Operand(0u));
7542
7543       if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
7544          for (unsigned i = 0; i < 2; i++) {
7545             Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7546                                       ctx->fs_inputs[fs_input::persp_centroid_p1 + i],
7547                                       ctx->fs_inputs[fs_input::persp_center_p1 + i],
7548                                       sel);
7549             ctx->fs_inputs[fs_input::persp_centroid_p1 + i] = new_coord;
7550          }
7551       }
7552
7553       if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
7554          for (unsigned i = 0; i < 2; i++) {
7555             Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7556                                       ctx->fs_inputs[fs_input::linear_centroid_p1 + i],
7557                                       ctx->fs_inputs[fs_input::linear_center_p1 + i],
7558                                       sel);
7559             ctx->fs_inputs[fs_input::linear_centroid_p1 + i] = new_coord;
7560          }
7561       }
7562    }
7563 }
7564
7565 void select_program(Program *program,
7566                     unsigned shader_count,
7567                     struct nir_shader *const *shaders,
7568                     ac_shader_config* config,
7569                     struct radv_shader_info *info,
7570                     struct radv_nir_compiler_options *options)
7571 {
7572    isel_context ctx = setup_isel_context(program, shader_count, shaders, config, info, options);
7573
7574    for (unsigned i = 0; i < shader_count; i++) {
7575       nir_shader *nir = shaders[i];
7576       init_context(&ctx, nir);
7577
7578       if (!i) {
7579          add_startpgm(&ctx); /* needs to be after init_context() for FS */
7580          append_logical_start(ctx.block);
7581       }
7582
7583       if_context ic;
7584       if (shader_count >= 2) {
7585          Builder bld(ctx.program, ctx.block);
7586          Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u)));
7587          Temp thread_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
7588                                    bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
7589          Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(s2)), count, thread_id);
7590
7591          begin_divergent_if_then(&ctx, &ic, cond);
7592       }
7593
7594       if (i) {
7595          Builder bld(ctx.program, ctx.block);
7596          bld.barrier(aco_opcode::p_memory_barrier_shared); //TODO: different barriers are needed for different stages
7597          bld.sopp(aco_opcode::s_barrier);
7598       }
7599
7600       if (ctx.stage == fragment_fs)
7601          handle_bc_optimize(&ctx);
7602
7603       nir_function_impl *func = nir_shader_get_entrypoint(nir);
7604       visit_cf_list(&ctx, &func->body);
7605
7606       if (ctx.program->info->so.num_outputs/*&& !ctx->is_gs_copy_shader */)
7607          emit_streamout(&ctx, 0);
7608
7609       if (ctx.stage == vertex_vs)
7610          create_vs_exports(&ctx);
7611
7612       if (shader_count >= 2) {
7613          begin_divergent_if_else(&ctx, &ic);
7614          end_divergent_if(&ctx, &ic);
7615       }
7616
7617       ralloc_free(ctx.divergent_vals);
7618    }
7619
7620    append_logical_end(ctx.block);
7621    ctx.block->kind |= block_kind_uniform;
7622    Builder bld(ctx.program, ctx.block);
7623    if (ctx.program->wb_smem_l1_on_end)
7624       bld.smem(aco_opcode::s_dcache_wb, false);
7625    bld.sopp(aco_opcode::s_endpgm);
7626
7627    /* cleanup CFG */
7628    for (Block& BB : program->blocks) {
7629       for (unsigned idx : BB.linear_preds)
7630          program->blocks[idx].linear_succs.emplace_back(BB.index);
7631       for (unsigned idx : BB.logical_preds)
7632          program->blocks[idx].logical_succs.emplace_back(BB.index);
7633    }
7634 }
7635 }