src/amd/compiler/aco_instruction_selection.cpp

   1 /*
   2  * Copyright © 2018 Valve Corporation
   3  * Copyright © 2018 Google
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  22  * IN THE SOFTWARE.
  23  *
  24  */
  25
  26 #include <algorithm>
  27 #include <map>
  28
  29 #include "aco_ir.h"
  30 #include "aco_builder.h"
  31 #include "aco_interface.h"
  32 #include "aco_instruction_selection_setup.cpp"
  33 #include "util/fast_idiv_by_const.h"
  34
  35 namespace aco {
  36 namespace {
  37
  38 class loop_info_RAII {
  39    isel_context* ctx;
  40    unsigned header_idx_old;
  41    Block* exit_old;
  42    bool divergent_cont_old;
  43    bool divergent_branch_old;
  44    bool divergent_if_old;
  45
  46 public:
  47    loop_info_RAII(isel_context* ctx, unsigned loop_header_idx, Block* loop_exit)
  48       : ctx(ctx),
  49         header_idx_old(ctx->cf_info.parent_loop.header_idx), exit_old(ctx->cf_info.parent_loop.exit),
  50         divergent_cont_old(ctx->cf_info.parent_loop.has_divergent_continue),
  51         divergent_branch_old(ctx->cf_info.parent_loop.has_divergent_branch),
  52         divergent_if_old(ctx->cf_info.parent_if.is_divergent)
  53    {
  54       ctx->cf_info.parent_loop.header_idx = loop_header_idx;
  55       ctx->cf_info.parent_loop.exit = loop_exit;
  56       ctx->cf_info.parent_loop.has_divergent_continue = false;
  57       ctx->cf_info.parent_loop.has_divergent_branch = false;
  58       ctx->cf_info.parent_if.is_divergent = false;
  59       ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
  60    }
  61
  62    ~loop_info_RAII()
  63    {
  64       ctx->cf_info.parent_loop.header_idx = header_idx_old;
  65       ctx->cf_info.parent_loop.exit = exit_old;
  66       ctx->cf_info.parent_loop.has_divergent_continue = divergent_cont_old;
  67       ctx->cf_info.parent_loop.has_divergent_branch = divergent_branch_old;
  68       ctx->cf_info.parent_if.is_divergent = divergent_if_old;
  69       ctx->cf_info.loop_nest_depth = ctx->cf_info.loop_nest_depth - 1;
  70       if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
  71          ctx->cf_info.exec_potentially_empty = false;
  72    }
  73 };
  74
  75 struct if_context {
  76    Temp cond;
  77
  78    bool divergent_old;
  79    bool exec_potentially_empty_old;
  80
  81    unsigned BB_if_idx;
  82    unsigned invert_idx;
  83    bool then_branch_divergent;
  84    Block BB_invert;
  85    Block BB_endif;
  86 };
  87
  88 static void visit_cf_list(struct isel_context *ctx,
  89                           struct exec_list *list);
  90
  91 static void add_logical_edge(unsigned pred_idx, Block *succ)
  92 {
  93    succ->logical_preds.emplace_back(pred_idx);
  94 }
  95
  96
  97 static void add_linear_edge(unsigned pred_idx, Block *succ)
  98 {
  99    succ->linear_preds.emplace_back(pred_idx);
 100 }
 101
 102 static void add_edge(unsigned pred_idx, Block *succ)
 103 {
 104    add_logical_edge(pred_idx, succ);
 105    add_linear_edge(pred_idx, succ);
 106 }
 107
 108 static void append_logical_start(Block *b)
 109 {
 110    Builder(NULL, b).pseudo(aco_opcode::p_logical_start);
 111 }
 112
 113 static void append_logical_end(Block *b)
 114 {
 115    Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
 116 }
 117
 118 Temp get_ssa_temp(struct isel_context *ctx, nir_ssa_def *def)
 119 {
 120    assert(ctx->allocated[def->index].id());
 121    return ctx->allocated[def->index];
 122 }
 123
 124 Temp emit_wqm(isel_context *ctx, Temp src, Temp dst=Temp(0, s1), bool program_needs_wqm = false)
 125 {
 126    Builder bld(ctx->program, ctx->block);
 127
 128    if (!dst.id())
 129       dst = bld.tmp(src.regClass());
 130
 131    if (ctx->stage != fragment_fs) {
 132       if (!dst.id())
 133          return src;
 134
 135       if (src.type() == RegType::vgpr || src.size() > 1)
 136          bld.copy(Definition(dst), src);
 137       else
 138          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
 139       return dst;
 140    }
 141
 142    bld.pseudo(aco_opcode::p_wqm, Definition(dst), src);
 143    ctx->program->needs_wqm |= program_needs_wqm;
 144    return dst;
 145 }
 146
 147 Temp as_vgpr(isel_context *ctx, Temp val)
 148 {
 149    if (val.type() == RegType::sgpr) {
 150       Builder bld(ctx->program, ctx->block);
 151       return bld.copy(bld.def(RegType::vgpr, val.size()), val);
 152    }
 153    assert(val.type() == RegType::vgpr);
 154    return val;
 155 }
 156
 157 //assumes a != 0xffffffff
 158 void emit_v_div_u32(isel_context *ctx, Temp dst, Temp a, uint32_t b)
 159 {
 160    assert(b != 0);
 161    Builder bld(ctx->program, ctx->block);
 162
 163    if (util_is_power_of_two_or_zero(b)) {
 164       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)util_logbase2(b)), a);
 165       return;
 166    }
 167
 168    util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32);
 169
 170    assert(info.multiplier <= 0xffffffff);
 171
 172    bool pre_shift = info.pre_shift != 0;
 173    bool increment = info.increment != 0;
 174    bool multiply = true;
 175    bool post_shift = info.post_shift != 0;
 176
 177    if (!pre_shift && !increment && !multiply && !post_shift) {
 178       bld.vop1(aco_opcode::v_mov_b32, Definition(dst), a);
 179       return;
 180    }
 181
 182    Temp pre_shift_dst = a;
 183    if (pre_shift) {
 184       pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst;
 185       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand((uint32_t)info.pre_shift), a);
 186    }
 187
 188    Temp increment_dst = pre_shift_dst;
 189    if (increment) {
 190       increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst;
 191       bld.vadd32(Definition(increment_dst), Operand((uint32_t) info.increment), pre_shift_dst);
 192    }
 193
 194    Temp multiply_dst = increment_dst;
 195    if (multiply) {
 196       multiply_dst = post_shift ? bld.tmp(v1) : dst;
 197       bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst,
 198                bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand((uint32_t)info.multiplier)));
 199    }
 200
 201    if (post_shift) {
 202       bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand((uint32_t)info.post_shift), multiply_dst);
 203    }
 204 }
 205
 206 void emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst)
 207 {
 208    Builder bld(ctx->program, ctx->block);
 209    bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(idx));
 210 }
 211
 212
 213 Temp emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc)
 214 {
 215    /* no need to extract the whole vector */
 216    if (src.regClass() == dst_rc) {
 217       assert(idx == 0);
 218       return src;
 219    }
 220    assert(src.size() > idx);
 221    Builder bld(ctx->program, ctx->block);
 222    auto it = ctx->allocated_vec.find(src.id());
 223    /* the size check needs to be early because elements other than 0 may be garbage */
 224    if (it != ctx->allocated_vec.end() && it->second[0].size() == dst_rc.size()) {
 225       if (it->second[idx].regClass() == dst_rc) {
 226          return it->second[idx];
 227       } else {
 228          assert(dst_rc.size() == it->second[idx].regClass().size());
 229          assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr);
 230          return bld.copy(bld.def(dst_rc), it->second[idx]);
 231       }
 232    }
 233
 234    if (src.size() == dst_rc.size()) {
 235       assert(idx == 0);
 236       return bld.copy(bld.def(dst_rc), src);
 237    } else {
 238       Temp dst = bld.tmp(dst_rc);
 239       emit_extract_vector(ctx, src, idx, dst);
 240       return dst;
 241    }
 242 }
 243
 244 void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
 245 {
 246    if (num_components == 1)
 247       return;
 248    if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
 249       return;
 250    aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
 251    split->operands[0] = Operand(vec_src);
 252    std::array<Temp,4> elems;
 253    for (unsigned i = 0; i < num_components; i++) {
 254       elems[i] = {ctx->program->allocateId(), RegClass(vec_src.type(), vec_src.size() / num_components)};
 255       split->definitions[i] = Definition(elems[i]);
 256    }
 257    ctx->block->instructions.emplace_back(std::move(split));
 258    ctx->allocated_vec.emplace(vec_src.id(), elems);
 259 }
 260
 261 /* This vector expansion uses a mask to determine which elements in the new vector
 262  * come from the original vector. The other elements are undefined. */
 263 void expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
 264 {
 265    emit_split_vector(ctx, vec_src, util_bitcount(mask));
 266
 267    if (vec_src == dst)
 268       return;
 269
 270    Builder bld(ctx->program, ctx->block);
 271    if (num_components == 1) {
 272       if (dst.type() == RegType::sgpr)
 273          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
 274       else
 275          bld.copy(Definition(dst), vec_src);
 276       return;
 277    }
 278
 279    unsigned component_size = dst.size() / num_components;
 280    std::array<Temp,4> elems;
 281
 282    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
 283    vec->definitions[0] = Definition(dst);
 284    unsigned k = 0;
 285    for (unsigned i = 0; i < num_components; i++) {
 286       if (mask & (1 << i)) {
 287          Temp src = emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size));
 288          if (dst.type() == RegType::sgpr)
 289             src = bld.as_uniform(src);
 290          vec->operands[i] = Operand(src);
 291       } else {
 292          vec->operands[i] = Operand(0u);
 293       }
 294       elems[i] = vec->operands[i].getTemp();
 295    }
 296    ctx->block->instructions.emplace_back(std::move(vec));
 297    ctx->allocated_vec.emplace(dst.id(), elems);
 298 }
 299
 300 Temp as_divergent_bool(isel_context *ctx, Temp val, bool vcc_hint)
 301 {
 302    if (val.regClass() == s2) {
 303       return val;
 304    } else {
 305       assert(val.regClass() == s1);
 306       Builder bld(ctx->program, ctx->block);
 307       Definition& def = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2),
 308                                  Operand((uint32_t) -1), Operand(0u), bld.scc(val)).def(0);
 309       if (vcc_hint)
 310          def.setHint(vcc);
 311       return def.getTemp();
 312    }
 313 }
 314
 315 Temp as_uniform_bool(isel_context *ctx, Temp val)
 316 {
 317    if (val.regClass() == s1) {
 318       return val;
 319    } else {
 320       assert(val.regClass() == s2);
 321       Builder bld(ctx->program, ctx->block);
 322       return bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), Operand(0u), Operand(val));
 323    }
 324 }
 325
 326 Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
 327 {
 328    if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
 329       return get_ssa_temp(ctx, src.src.ssa);
 330
 331    if (src.src.ssa->num_components == size) {
 332       bool identity_swizzle = true;
 333       for (unsigned i = 0; identity_swizzle && i < size; i++) {
 334          if (src.swizzle[i] != i)
 335             identity_swizzle = false;
 336       }
 337       if (identity_swizzle)
 338          return get_ssa_temp(ctx, src.src.ssa);
 339    }
 340
 341    Temp vec = get_ssa_temp(ctx, src.src.ssa);
 342    unsigned elem_size = vec.size() / src.src.ssa->num_components;
 343    assert(elem_size > 0); /* TODO: 8 and 16-bit vectors not supported */
 344    assert(vec.size() % elem_size == 0);
 345
 346    RegClass elem_rc = RegClass(vec.type(), elem_size);
 347    if (size == 1) {
 348       return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc);
 349    } else {
 350       assert(size <= 4);
 351       std::array<Temp,4> elems;
 352       aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
 353       for (unsigned i = 0; i < size; ++i) {
 354          elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc);
 355          vec_instr->operands[i] = Operand{elems[i]};
 356       }
 357       Temp dst{ctx->program->allocateId(), RegClass(vec.type(), elem_size * size)};
 358       vec_instr->definitions[0] = Definition(dst);
 359       ctx->block->instructions.emplace_back(std::move(vec_instr));
 360       ctx->allocated_vec.emplace(dst.id(), elems);
 361       return dst;
 362    }
 363 }
 364
 365 Temp convert_pointer_to_64_bit(isel_context *ctx, Temp ptr)
 366 {
 367    if (ptr.size() == 2)
 368       return ptr;
 369    Builder bld(ctx->program, ctx->block);
 370    return bld.pseudo(aco_opcode::p_create_vector, bld.def(s2),
 371                      ptr, Operand((unsigned)ctx->options->address32_hi));
 372 }
 373
 374 void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool writes_scc)
 375 {
 376    aco_ptr<SOP2_instruction> sop2{create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 2 : 1)};
 377    sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
 378    sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
 379    sop2->definitions[0] = Definition(dst);
 380    if (writes_scc)
 381       sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
 382    ctx->block->instructions.emplace_back(std::move(sop2));
 383 }
 384
 385 void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool commutative, bool swap_srcs=false)
 386 {
 387    Builder bld(ctx->program, ctx->block);
 388    Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
 389    Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
 390    if (src1.type() == RegType::sgpr) {
 391       if (commutative && src0.type() == RegType::vgpr) {
 392          Temp t = src0;
 393          src0 = src1;
 394          src1 = t;
 395       } else if (src0.type() == RegType::vgpr &&
 396                  op != aco_opcode::v_madmk_f32 &&
 397                  op != aco_opcode::v_madak_f32 &&
 398                  op != aco_opcode::v_madmk_f16 &&
 399                  op != aco_opcode::v_madak_f16) {
 400          /* If the instruction is not commutative, we emit a VOP3A instruction */
 401          bld.vop2_e64(op, Definition(dst), src0, src1);
 402          return;
 403       } else {
 404          src1 = bld.copy(bld.def(RegType::vgpr, src1.size()), src1); //TODO: as_vgpr
 405       }
 406    }
 407    bld.vop2(op, Definition(dst), src0, src1);
 408 }
 409
 410 void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 411 {
 412    Temp src0 = get_alu_src(ctx, instr->src[0]);
 413    Temp src1 = get_alu_src(ctx, instr->src[1]);
 414    Temp src2 = get_alu_src(ctx, instr->src[2]);
 415
 416    /* ensure that the instruction has at most 1 sgpr operand
 417     * The optimizer will inline constants for us */
 418    if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr)
 419       src0 = as_vgpr(ctx, src0);
 420    if (src1.type() == RegType::sgpr && src2.type() == RegType::sgpr)
 421       src1 = as_vgpr(ctx, src1);
 422    if (src2.type() == RegType::sgpr && src0.type() == RegType::sgpr)
 423       src2 = as_vgpr(ctx, src2);
 424
 425    Builder bld(ctx->program, ctx->block);
 426    bld.vop3(op, Definition(dst), src0, src1, src2);
 427 }
 428
 429 void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 430 {
 431    Builder bld(ctx->program, ctx->block);
 432    bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
 433 }
 434
 435 void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 436 {
 437    Temp src0 = get_alu_src(ctx, instr->src[0]);
 438    Temp src1 = get_alu_src(ctx, instr->src[1]);
 439    aco_ptr<Instruction> vopc;
 440    if (src1.type() == RegType::sgpr) {
 441       if (src0.type() == RegType::vgpr) {
 442          /* to swap the operands, we might also have to change the opcode */
 443          switch (op) {
 444             case aco_opcode::v_cmp_lt_f32:
 445                op = aco_opcode::v_cmp_gt_f32;
 446                break;
 447             case aco_opcode::v_cmp_ge_f32:
 448                op = aco_opcode::v_cmp_le_f32;
 449                break;
 450             case aco_opcode::v_cmp_lt_i32:
 451                op = aco_opcode::v_cmp_gt_i32;
 452                break;
 453             case aco_opcode::v_cmp_ge_i32:
 454                op = aco_opcode::v_cmp_le_i32;
 455                break;
 456             case aco_opcode::v_cmp_lt_u32:
 457                op = aco_opcode::v_cmp_gt_u32;
 458                break;
 459             case aco_opcode::v_cmp_ge_u32:
 460                op = aco_opcode::v_cmp_le_u32;
 461                break;
 462             case aco_opcode::v_cmp_lt_f64:
 463                op = aco_opcode::v_cmp_gt_f64;
 464                break;
 465             case aco_opcode::v_cmp_ge_f64:
 466                op = aco_opcode::v_cmp_le_f64;
 467                break;
 468             case aco_opcode::v_cmp_lt_i64:
 469                op = aco_opcode::v_cmp_gt_i64;
 470                break;
 471             case aco_opcode::v_cmp_ge_i64:
 472                op = aco_opcode::v_cmp_le_i64;
 473                break;
 474             case aco_opcode::v_cmp_lt_u64:
 475                op = aco_opcode::v_cmp_gt_u64;
 476                break;
 477             case aco_opcode::v_cmp_ge_u64:
 478                op = aco_opcode::v_cmp_le_u64;
 479                break;
 480             default: /* eq and ne are commutative */
 481                break;
 482          }
 483          Temp t = src0;
 484          src0 = src1;
 485          src1 = t;
 486       } else {
 487          src1 = as_vgpr(ctx, src1);
 488       }
 489    }
 490    Builder bld(ctx->program, ctx->block);
 491    bld.vopc(op, Definition(dst), src0, src1).def(0).setHint(vcc);
 492 }
 493
 494 void emit_comparison(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
 495 {
 496    if (dst.regClass() == s2) {
 497       emit_vopc_instruction(ctx, instr, op, dst);
 498       if (!ctx->divergent_vals[instr->dest.dest.ssa.index])
 499          emit_split_vector(ctx, dst, 2);
 500    } else if (dst.regClass() == s1) {
 501       Temp src0 = get_alu_src(ctx, instr->src[0]);
 502       Temp src1 = get_alu_src(ctx, instr->src[1]);
 503       assert(src0.type() == RegType::sgpr && src1.type() == RegType::sgpr);
 504
 505       Builder bld(ctx->program, ctx->block);
 506       bld.sopc(op, bld.scc(Definition(dst)), src0, src1);
 507
 508    } else {
 509       assert(false);
 510    }
 511 }
 512
 513 void emit_boolean_logic(isel_context *ctx, nir_alu_instr *instr, aco_opcode op32, aco_opcode op64, Temp dst)
 514 {
 515    Builder bld(ctx->program, ctx->block);
 516    Temp src0 = get_alu_src(ctx, instr->src[0]);
 517    Temp src1 = get_alu_src(ctx, instr->src[1]);
 518    if (dst.regClass() == s2) {
 519       bld.sop2(op64, Definition(dst), bld.def(s1, scc),
 520                as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
 521    } else {
 522       assert(dst.regClass() == s1);
 523       bld.sop2(op32, bld.def(s1), bld.scc(Definition(dst)),
 524                as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
 525    }
 526 }
 527
 528
 529 void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
 530 {
 531    Builder bld(ctx->program, ctx->block);
 532    Temp cond = get_alu_src(ctx, instr->src[0]);
 533    Temp then = get_alu_src(ctx, instr->src[1]);
 534    Temp els = get_alu_src(ctx, instr->src[2]);
 535
 536    if (dst.type() == RegType::vgpr) {
 537       cond = as_divergent_bool(ctx, cond, true);
 538
 539       aco_ptr<Instruction> bcsel;
 540       if (dst.size() == 1) {
 541          then = as_vgpr(ctx, then);
 542          els = as_vgpr(ctx, els);
 543
 544          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
 545       } else if (dst.size() == 2) {
 546          Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
 547          bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
 548          Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
 549          bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els);
 550
 551          Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond);
 552          Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond);
 553
 554          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
 555       } else {
 556          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 557          nir_print_instr(&instr->instr, stderr);
 558          fprintf(stderr, "\n");
 559       }
 560       return;
 561    }
 562
 563    if (instr->dest.dest.ssa.bit_size != 1) { /* uniform condition and values in sgpr */
 564       if (dst.regClass() == s1 || dst.regClass() == s2) {
 565          assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
 566          aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
 567          bld.sop2(op, Definition(dst), then, els, bld.scc(as_uniform_bool(ctx, cond)));
 568       } else {
 569          fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
 570          nir_print_instr(&instr->instr, stderr);
 571          fprintf(stderr, "\n");
 572       }
 573       return;
 574    }
 575
 576    /* boolean bcsel */
 577    assert(instr->dest.dest.ssa.bit_size == 1);
 578
 579    if (dst.regClass() == s1)
 580       cond = as_uniform_bool(ctx, cond);
 581
 582    if (cond.regClass() == s1) { /* uniform selection */
 583       aco_opcode op;
 584       if (dst.regClass() == s2) {
 585          op = aco_opcode::s_cselect_b64;
 586          then = as_divergent_bool(ctx, then, false);
 587          els = as_divergent_bool(ctx, els, false);
 588       } else {
 589          assert(dst.regClass() == s1);
 590          op = aco_opcode::s_cselect_b32;
 591          then = as_uniform_bool(ctx, then);
 592          els = as_uniform_bool(ctx, els);
 593       }
 594       bld.sop2(op, Definition(dst), then, els, bld.scc(cond));
 595       return;
 596    }
 597
 598    /* divergent boolean bcsel
 599     * this implements bcsel on bools: dst = s0 ? s1 : s2
 600     * are going to be: dst = (s0 & s1) | (~s0 & s2) */
 601    assert (dst.regClass() == s2);
 602    then = as_divergent_bool(ctx, then, false);
 603    els = as_divergent_bool(ctx, els, false);
 604
 605    if (cond.id() != then.id())
 606       then = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), cond, then);
 607
 608    if (cond.id() == els.id())
 609       bld.sop1(aco_opcode::s_mov_b64, Definition(dst), then);
 610    else
 611       bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), then,
 612                bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), els, cond));
 613 }
 614
 615 void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
 616 {
 617    if (!instr->dest.dest.is_ssa) {
 618       fprintf(stderr, "nir alu dst not in ssa: ");
 619       nir_print_instr(&instr->instr, stderr);
 620       fprintf(stderr, "\n");
 621       abort();
 622    }
 623    Builder bld(ctx->program, ctx->block);
 624    Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
 625    switch(instr->op) {
 626    case nir_op_vec2:
 627    case nir_op_vec3:
 628    case nir_op_vec4: {
 629       std::array<Temp,4> elems;
 630       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
 631       for (unsigned i = 0; i < instr->dest.dest.ssa.num_components; ++i) {
 632          elems[i] = get_alu_src(ctx, instr->src[i]);
 633          vec->operands[i] = Operand{elems[i]};
 634       }
 635       vec->definitions[0] = Definition(dst);
 636       ctx->block->instructions.emplace_back(std::move(vec));
 637       ctx->allocated_vec.emplace(dst.id(), elems);
 638       break;
 639    }
 640    case nir_op_mov: {
 641       Temp src = get_alu_src(ctx, instr->src[0]);
 642       aco_ptr<Instruction> mov;
 643       if (dst.type() == RegType::sgpr) {
 644          if (src.type() == RegType::vgpr)
 645             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
 646          else if (src.regClass() == s1)
 647             bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
 648          else if (src.regClass() == s2)
 649             bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
 650          else
 651             unreachable("wrong src register class for nir_op_imov");
 652       } else if (dst.regClass() == v1) {
 653          bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
 654       } else if (dst.regClass() == v2) {
 655          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
 656       } else {
 657          nir_print_instr(&instr->instr, stderr);
 658          unreachable("Should have been lowered to scalar.");
 659       }
 660       break;
 661    }
 662    case nir_op_inot: {
 663       Temp src = get_alu_src(ctx, instr->src[0]);
 664       /* uniform booleans */
 665       if (instr->dest.dest.ssa.bit_size == 1 && dst.regClass() == s1) {
 666          if (src.regClass() == s1) {
 667             /* in this case, src is either 1 or 0 */
 668             bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.scc(Definition(dst)), Operand(1u), src);
 669          } else {
 670             /* src is either exec_mask or 0 */
 671             assert(src.regClass() == s2);
 672             bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(Definition(dst)), Operand(0u), src);
 673          }
 674       } else if (dst.regClass() == v1) {
 675          emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
 676       } else if (dst.type() == RegType::sgpr) {
 677          aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
 678          bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
 679       } else {
 680          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 681          nir_print_instr(&instr->instr, stderr);
 682          fprintf(stderr, "\n");
 683       }
 684       break;
 685    }
 686    case nir_op_ineg: {
 687       Temp src = get_alu_src(ctx, instr->src[0]);
 688       if (dst.regClass() == v1) {
 689          bld.vsub32(Definition(dst), Operand(0u), Operand(src));
 690       } else if (dst.regClass() == s1) {
 691          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand((uint32_t) -1), src);
 692       } else {
 693          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 694          nir_print_instr(&instr->instr, stderr);
 695          fprintf(stderr, "\n");
 696       }
 697       break;
 698    }
 699    case nir_op_iabs: {
 700       if (dst.regClass() == s1) {
 701          bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), get_alu_src(ctx, instr->src[0]));
 702       } else if (dst.regClass() == v1) {
 703          Temp src = get_alu_src(ctx, instr->src[0]);
 704          bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
 705       } else {
 706          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 707          nir_print_instr(&instr->instr, stderr);
 708          fprintf(stderr, "\n");
 709       }
 710       break;
 711    }
 712    case nir_op_isign: {
 713       Temp src = get_alu_src(ctx, instr->src[0]);
 714       if (dst.regClass() == s1) {
 715          Temp tmp = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
 716          Temp gtz = bld.sopc(aco_opcode::s_cmp_gt_i32, bld.def(s1, scc), src, Operand(0u));
 717          bld.sop2(aco_opcode::s_add_i32, Definition(dst), bld.def(s1, scc), gtz, tmp);
 718       } else if (dst.regClass() == s2) {
 719          Temp neg = bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand(63u));
 720          Temp neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand(0u));
 721          bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, neqz);
 722       } else if (dst.regClass() == v1) {
 723          Temp tmp = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
 724          Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
 725          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(1u), tmp, gtz);
 726       } else if (dst.regClass() == v2) {
 727          Temp upper = emit_extract_vector(ctx, src, 1, v1);
 728          Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), upper);
 729          Temp gtz = bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
 730          Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(1u), neg, gtz);
 731          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
 732          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
 733       } else {
 734          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 735          nir_print_instr(&instr->instr, stderr);
 736          fprintf(stderr, "\n");
 737       }
 738       break;
 739    }
 740    case nir_op_imax: {
 741       if (dst.regClass() == v1) {
 742          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true);
 743       } else if (dst.regClass() == s1) {
 744          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
 745       } else {
 746          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 747          nir_print_instr(&instr->instr, stderr);
 748          fprintf(stderr, "\n");
 749       }
 750       break;
 751    }
 752    case nir_op_umax: {
 753       if (dst.regClass() == v1) {
 754          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true);
 755       } else if (dst.regClass() == s1) {
 756          emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
 757       } else {
 758          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 759          nir_print_instr(&instr->instr, stderr);
 760          fprintf(stderr, "\n");
 761       }
 762       break;
 763    }
 764    case nir_op_imin: {
 765       if (dst.regClass() == v1) {
 766          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true);
 767       } else if (dst.regClass() == s1) {
 768          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
 769       } else {
 770          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 771          nir_print_instr(&instr->instr, stderr);
 772          fprintf(stderr, "\n");
 773       }
 774       break;
 775    }
 776    case nir_op_umin: {
 777       if (dst.regClass() == v1) {
 778          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true);
 779       } else if (dst.regClass() == s1) {
 780          emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
 781       } else {
 782          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 783          nir_print_instr(&instr->instr, stderr);
 784          fprintf(stderr, "\n");
 785       }
 786       break;
 787    }
 788    case nir_op_ior: {
 789       if (instr->dest.dest.ssa.bit_size == 1) {
 790          emit_boolean_logic(ctx, instr, aco_opcode::s_or_b32, aco_opcode::s_or_b64, dst);
 791       } else if (dst.regClass() == v1) {
 792          emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
 793       } else if (dst.regClass() == s1) {
 794          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
 795       } else if (dst.regClass() == s2) {
 796          emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
 797       } else {
 798          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 799          nir_print_instr(&instr->instr, stderr);
 800          fprintf(stderr, "\n");
 801       }
 802       break;
 803    }
 804    case nir_op_iand: {
 805       if (instr->dest.dest.ssa.bit_size == 1) {
 806          emit_boolean_logic(ctx, instr, aco_opcode::s_and_b32, aco_opcode::s_and_b64, dst);
 807       } else if (dst.regClass() == v1) {
 808          emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
 809       } else if (dst.regClass() == s1) {
 810          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
 811       } else if (dst.regClass() == s2) {
 812          emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
 813       } else {
 814          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 815          nir_print_instr(&instr->instr, stderr);
 816          fprintf(stderr, "\n");
 817       }
 818       break;
 819    }
 820    case nir_op_ixor: {
 821       if (instr->dest.dest.ssa.bit_size == 1) {
 822          emit_boolean_logic(ctx, instr, aco_opcode::s_xor_b32, aco_opcode::s_xor_b64, dst);
 823       } else if (dst.regClass() == v1) {
 824          emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
 825       } else if (dst.regClass() == s1) {
 826          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
 827       } else if (dst.regClass() == s2) {
 828          emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
 829       } else {
 830          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 831          nir_print_instr(&instr->instr, stderr);
 832          fprintf(stderr, "\n");
 833       }
 834       break;
 835    }
 836    case nir_op_ushr: {
 837       if (dst.regClass() == v1) {
 838          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true);
 839       } else if (dst.regClass() == v2) {
 840          bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst),
 841                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
 842       } else if (dst.regClass() == s2) {
 843          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true);
 844       } else if (dst.regClass() == s1) {
 845          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
 846       } else {
 847          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 848          nir_print_instr(&instr->instr, stderr);
 849          fprintf(stderr, "\n");
 850       }
 851       break;
 852    }
 853    case nir_op_ishl: {
 854       if (dst.regClass() == v1) {
 855          emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true);
 856       } else if (dst.regClass() == v2) {
 857          bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst),
 858                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
 859       } else if (dst.regClass() == s1) {
 860          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true);
 861       } else if (dst.regClass() == s2) {
 862          emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
 863       } else {
 864          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 865          nir_print_instr(&instr->instr, stderr);
 866          fprintf(stderr, "\n");
 867       }
 868       break;
 869    }
 870    case nir_op_ishr: {
 871       if (dst.regClass() == v1) {
 872          emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true);
 873       } else if (dst.regClass() == v2) {
 874          bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst),
 875                   get_alu_src(ctx, instr->src[1]), get_alu_src(ctx, instr->src[0]));
 876       } else if (dst.regClass() == s1) {
 877          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true);
 878       } else if (dst.regClass() == s2) {
 879          emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
 880       } else {
 881          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 882          nir_print_instr(&instr->instr, stderr);
 883          fprintf(stderr, "\n");
 884       }
 885       break;
 886    }
 887    case nir_op_find_lsb: {
 888       Temp src = get_alu_src(ctx, instr->src[0]);
 889       if (src.regClass() == s1) {
 890          bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src);
 891       } else if (src.regClass() == v1) {
 892          emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst);
 893       } else if (src.regClass() == s2) {
 894          bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
 895       } else {
 896          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 897          nir_print_instr(&instr->instr, stderr);
 898          fprintf(stderr, "\n");
 899       }
 900       break;
 901    }
 902    case nir_op_ufind_msb:
 903    case nir_op_ifind_msb: {
 904       Temp src = get_alu_src(ctx, instr->src[0]);
 905       if (src.regClass() == s1 || src.regClass() == s2) {
 906          aco_opcode op = src.regClass() == s2 ?
 907                          (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 : aco_opcode::s_flbit_i32_i64) :
 908                          (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 : aco_opcode::s_flbit_i32);
 909          Temp msb_rev = bld.sop1(op, bld.def(s1), src);
 910
 911          Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc),
 912                                         Operand(src.size() * 32u - 1u), msb_rev);
 913          Temp msb = sub.def(0).getTemp();
 914          Temp carry = sub.def(1).getTemp();
 915
 916          bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t)-1), msb, carry);
 917       } else if (src.regClass() == v1) {
 918          aco_opcode op = instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32;
 919          Temp msb_rev = bld.tmp(v1);
 920          emit_vop1_instruction(ctx, instr, op, msb_rev);
 921          Temp msb = bld.tmp(v1);
 922          Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
 923          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
 924       } else {
 925          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 926          nir_print_instr(&instr->instr, stderr);
 927          fprintf(stderr, "\n");
 928       }
 929       break;
 930    }
 931    case nir_op_bitfield_reverse: {
 932       if (dst.regClass() == s1) {
 933          bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
 934       } else if (dst.regClass() == v1) {
 935          bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
 936       } else {
 937          fprintf(stderr, "Unimplemented NIR instr bit size: ");
 938          nir_print_instr(&instr->instr, stderr);
 939          fprintf(stderr, "\n");
 940       }
 941       break;
 942    }
   case nir_op_iadd: {
      /* Integer addition. 32-bit destinations use a single add; two-dword
       * destinations are assembled from two 32-bit adds chained through a carry. */
      if (dst.regClass() == s1) {
         emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true);
         break;
      }

      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == v1) {
         bld.vadd32(Definition(dst), Operand(src0), Operand(src1));
         break;
      }

      assert(src0.size() == 2 && src1.size() == 2);
      /* Split each operand into two dwords: srcN0 is the first element of the
       * split (fed to the carry-producing add), srcN1 the second.
       * NOTE(review): the first half is typed after the source, the second after
       * the destination - confirm this asymmetry is intended. */
      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);

      if (dst.regClass() == s2) {
         /* Scalar: s_add_u32 writes the carry to SCC, s_addc_u32 consumes it. */
         Temp carry = bld.tmp(s1);
         Temp dst0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
         Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, bld.scc(carry));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else if (dst.regClass() == v2) {
         /* Vector: the first vadd32 requests a carry-out (its second definition),
          * which is passed as carry-in to the second vadd32. */
         Temp dst0 = bld.tmp(v1);
         Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp();
         Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_uadd_sat: {
      /* Unsigned saturating 32-bit add: the result is replaced by 0xFFFFFFFF
       * when the addition carries out. */
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         /* Scalar: add with carry in SCC, then select -1 or the sum on that carry. */
         Temp tmp = bld.tmp(s1), carry = bld.tmp(s1);
         bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)),
                  src0, src1);
         bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand((uint32_t) -1), tmp, bld.scc(carry));
      } else if (dst.regClass() == v1) {
         if (ctx->options->chip_class >= GFX9) {
            /* GFX9+: a single v_add_u32 in VOP3 encoding with the clamp bit set. */
            aco_ptr<VOP3A_instruction> add{create_instruction<VOP3A_instruction>(aco_opcode::v_add_u32, asVOP3(Format::VOP2), 2, 1)};
            add->operands[0] = Operand(src0);
            add->operands[1] = Operand(src1);
            add->definitions[0] = Definition(dst);
            add->clamp = 1;
            ctx->block->instructions.emplace_back(std::move(add));
         } else {
            /* Older chips: add with carry-out, then pick -1 when the carry is set.
             * The operands are swapped if needed so that src1 is a VGPR. */
            if (src1.regClass() != v1)
               std::swap(src0, src1);
            assert(src1.regClass() == v1);
            Temp tmp = bld.tmp(v1);
            Temp carry = bld.vadd32(Definition(tmp), src0, src1, true).def(1).getTemp();
            bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
         }
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_uadd_carry: {
      /* Produces the carry-out of an unsigned addition as an integer (0 or 1);
       * the sum itself is discarded. */
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         /* The SCC output of s_add_u32 is written straight into dst. */
         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
         break;
      }
      if (dst.regClass() == v1) {
         /* Turn the carry mask into 0/1 with a conditional select. */
         Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp();
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), carry);
         break;
      }

      /* Two-dword case: split both operands, add the first halves, then add the
       * second halves with carry-in; only the final carry is kept. */
      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
      if (dst.regClass() == s2) {
         Temp carry = bld.tmp(s1);
         bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
         carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(carry)).def(1).getTemp();
         /* Result vector is {carry, 0}. */
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
      } else if (dst.regClass() == v2) {
         Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp();
         carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp();
         carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
         /* Result vector is {carry, 0}. */
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
1048    case nir_op_isub: {
1049       if (dst.regClass() == s1) {
1050          emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true);
1051          break;
1052       }
1053
1054       Temp src0 = get_alu_src(ctx, instr->src[0]);
1055       Temp src1 = get_alu_src(ctx, instr->src[1]);
1056       if (dst.regClass() == v1) {
1057          bld.vsub32(Definition(dst), src0, src1);
1058          break;
1059       }
1060
1061       Temp src00 = bld.tmp(src0.type(), 1);
1062       Temp src01 = bld.tmp(dst.type(), 1);
1063       bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
1064       Temp src10 = bld.tmp(src1.type(), 1);
1065       Temp src11 = bld.tmp(dst.type(), 1);
1066       bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
1067       if (dst.regClass() == s2) {
1068          Temp carry = bld.tmp(s1);
1069          Temp dst0 = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10);
1070          Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, carry);
1071          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
1072       } else if (dst.regClass() == v2) {
1073          Temp lower = bld.tmp(v1);
1074          Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp();
1075          Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
1076          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1077       } else {
1078          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1079          nir_print_instr(&instr->instr, stderr);
1080          fprintf(stderr, "\n");
1081       }
1082       break;
1083    }
   case nir_op_usub_borrow: {
      /* Produces the borrow-out of an unsigned subtraction as an integer (0 or 1);
       * the difference itself is discarded. */
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.regClass() == s1) {
         /* The SCC output of s_sub_u32 is written straight into dst. */
         bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1);
         break;
      } else if (dst.regClass() == v1) {
         /* Turn the borrow mask into 0/1 with a conditional select. */
         Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp();
         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), borrow);
         break;
      }

      /* Two-dword case: split both operands, subtract the first halves, then
       * subtract the second halves with borrow-in; only the final borrow is kept. */
      Temp src00 = bld.tmp(src0.type(), 1);
      Temp src01 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
      Temp src10 = bld.tmp(src1.type(), 1);
      Temp src11 = bld.tmp(dst.type(), 1);
      bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
      if (dst.regClass() == s2) {
         Temp borrow = bld.tmp(s1);
         bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10);
         borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, bld.scc(borrow)).def(1).getTemp();
         /* Result vector is {borrow, 0}. */
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
      } else if (dst.regClass() == v2) {
         Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp();
         borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp();
         borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
         /* Result vector is {borrow, 0}. */
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
1119    case nir_op_imul: {
1120       if (dst.regClass() == v1) {
1121          bld.vop3(aco_opcode::v_mul_lo_u32, Definition(dst),
1122                   get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1123       } else if (dst.regClass() == s1) {
1124          emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
1125       } else {
1126          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1127          nir_print_instr(&instr->instr, stderr);
1128          fprintf(stderr, "\n");
1129       }
1130       break;
1131    }
1132    case nir_op_umul_high: {
1133       if (dst.regClass() == v1) {
1134          bld.vop3(aco_opcode::v_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1135       } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1136          bld.sop2(aco_opcode::s_mul_hi_u32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1137       } else if (dst.regClass() == s1) {
1138          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1139                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1140          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1141       } else {
1142          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1143          nir_print_instr(&instr->instr, stderr);
1144          fprintf(stderr, "\n");
1145       }
1146       break;
1147    }
1148    case nir_op_imul_high: {
1149       if (dst.regClass() == v1) {
1150          bld.vop3(aco_opcode::v_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1151       } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) {
1152          bld.sop2(aco_opcode::s_mul_hi_i32, Definition(dst), get_alu_src(ctx, instr->src[0]), get_alu_src(ctx, instr->src[1]));
1153       } else if (dst.regClass() == s1) {
1154          Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]),
1155                              as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1156          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
1157       } else {
1158          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1159          nir_print_instr(&instr->instr, stderr);
1160          fprintf(stderr, "\n");
1161       }
1162       break;
1163    }
1164    case nir_op_fmul: {
1165       if (dst.size() == 1) {
1166          emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
1167       } else if (dst.size() == 2) {
1168          bld.vop3(aco_opcode::v_mul_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1169                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1170       } else {
1171          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1172          nir_print_instr(&instr->instr, stderr);
1173          fprintf(stderr, "\n");
1174       }
1175       break;
1176    }
1177    case nir_op_fadd: {
1178       if (dst.size() == 1) {
1179          emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
1180       } else if (dst.size() == 2) {
1181          bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]),
1182                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1183       } else {
1184          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1185          nir_print_instr(&instr->instr, stderr);
1186          fprintf(stderr, "\n");
1187       }
1188       break;
1189    }
   case nir_op_fsub: {
      /* Float subtraction. */
      Temp src0 = get_alu_src(ctx, instr->src[0]);
      Temp src1 = get_alu_src(ctx, instr->src[1]);
      if (dst.size() == 1) {
         /* Use v_sub_f32 unless only src0 is a VGPR; in that case use the
          * reversed opcode and let the helper swap the operands. */
         if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
            emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
         else
            emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true);
      } else if (dst.size() == 2) {
         /* 64-bit: emitted as v_add_f64 with the negate modifier on operand 1. */
         Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst),
                                     get_alu_src(ctx, instr->src[0]),
                                     as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
         VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
         sub->neg[1] = true;
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
1211    case nir_op_fmod:
1212    case nir_op_frem: {
1213       if (dst.size() == 1) {
1214          Temp rcp = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_alu_src(ctx, instr->src[1]));
1215          Temp mul = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), get_alu_src(ctx, instr->src[0]), rcp);
1216
1217          aco_opcode op = instr->op == nir_op_fmod ? aco_opcode::v_floor_f32 : aco_opcode::v_trunc_f32;
1218          Temp floor = bld.vop1(op, bld.def(v1), mul);
1219
1220          mul = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), get_alu_src(ctx, instr->src[1]), floor);
1221          bld.vop2(aco_opcode::v_sub_f32, Definition(dst), get_alu_src(ctx, instr->src[0]), mul);
1222       } else if (dst.size() == 2) {
1223          Temp rcp = bld.vop1(aco_opcode::v_rcp_f64, bld.def(v2), get_alu_src(ctx, instr->src[1]));
1224          Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), get_alu_src(ctx, instr->src[0]), rcp);
1225
1226          aco_opcode op = instr->op == nir_op_fmod ? aco_opcode::v_floor_f64 : aco_opcode::v_trunc_f64;
1227          Temp floor = bld.vop1(op, bld.def(v1), mul);
1228
1229          mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), get_alu_src(ctx, instr->src[1]), floor);
1230          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), get_alu_src(ctx, instr->src[0]), mul);
1231          VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
1232          sub->neg[1] = true;
1233       } else {
1234          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1235          nir_print_instr(&instr->instr, stderr);
1236          fprintf(stderr, "\n");
1237       }
1238       break;
1239    }
1240    case nir_op_fmax: {
1241       if (dst.size() == 1) {
1242          emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true);
1243       } else if (dst.size() == 2) {
1244          bld.vop3(aco_opcode::v_max_f64, Definition(dst),
1245                   get_alu_src(ctx, instr->src[0]),
1246                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1247       } else {
1248          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1249          nir_print_instr(&instr->instr, stderr);
1250          fprintf(stderr, "\n");
1251       }
1252       break;
1253    }
1254    case nir_op_fmin: {
1255       if (dst.size() == 1) {
1256          emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true);
1257       } else if (dst.size() == 2) {
1258          bld.vop3(aco_opcode::v_min_f64, Definition(dst),
1259                   get_alu_src(ctx, instr->src[0]),
1260                   as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
1261       } else {
1262          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1263          nir_print_instr(&instr->instr, stderr);
1264          fprintf(stderr, "\n");
1265       }
1266       break;
1267    }
1268    case nir_op_fmax3: {
1269       if (dst.size() == 1) {
1270          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst);
1271       } else {
1272          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1273          nir_print_instr(&instr->instr, stderr);
1274          fprintf(stderr, "\n");
1275       }
1276       break;
1277    }
1278    case nir_op_fmin3: {
1279       if (dst.size() == 1) {
1280          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst);
1281       } else {
1282          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1283          nir_print_instr(&instr->instr, stderr);
1284          fprintf(stderr, "\n");
1285       }
1286       break;
1287    }
1288    case nir_op_fmed3: {
1289       if (dst.size() == 1) {
1290          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst);
1291       } else {
1292          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1293          nir_print_instr(&instr->instr, stderr);
1294          fprintf(stderr, "\n");
1295       }
1296       break;
1297    }
1298    case nir_op_umax3: {
1299       if (dst.size() == 1) {
1300          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
1301       } else {
1302          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1303          nir_print_instr(&instr->instr, stderr);
1304          fprintf(stderr, "\n");
1305       }
1306       break;
1307    }
1308    case nir_op_umin3: {
1309       if (dst.size() == 1) {
1310          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
1311       } else {
1312          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1313          nir_print_instr(&instr->instr, stderr);
1314          fprintf(stderr, "\n");
1315       }
1316       break;
1317    }
1318    case nir_op_umed3: {
1319       if (dst.size() == 1) {
1320          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
1321       } else {
1322          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1323          nir_print_instr(&instr->instr, stderr);
1324          fprintf(stderr, "\n");
1325       }
1326       break;
1327    }
1328    case nir_op_imax3: {
1329       if (dst.size() == 1) {
1330          emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
1331       } else {
1332          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1333          nir_print_instr(&instr->instr, stderr);
1334          fprintf(stderr, "\n");
1335       }
1336       break;
1337    }
1338    case nir_op_imin3: {
1339       if (dst.size() == 1) {
1340          emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
1341       } else {
1342          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1343          nir_print_instr(&instr->instr, stderr);
1344          fprintf(stderr, "\n");
1345       }
1346       break;
1347    }
1348    case nir_op_imed3: {
1349       if (dst.size() == 1) {
1350          emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
1351       } else {
1352          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1353          nir_print_instr(&instr->instr, stderr);
1354          fprintf(stderr, "\n");
1355       }
1356       break;
1357    }
   case nir_op_cube_face_coord: {
      /* Produces a 2-component result (sc, tc) from a 3-component input using
       * the v_cubema/v_cubesc/v_cubetc opcodes. */
      Temp in = get_alu_src(ctx, instr->src[0], 3);
      Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
                      emit_extract_vector(ctx, in, 1, v1),
                      emit_extract_vector(ctx, in, 2, v1) };
      Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]);
      /* ma now holds the reciprocal of the v_cubema result. */
      ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
      Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
      Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
      /* Each coordinate becomes coord * (1 / ma) + 0.5 via a multiply-add with constant. */
      sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
      tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
      bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
      break;
   }
   case nir_op_cube_face_index: {
      /* Feeds the three input components to v_cubeid_f32 and writes its result to dst. */
      Temp in = get_alu_src(ctx, instr->src[0], 3);
      Temp src[3] = { emit_extract_vector(ctx, in, 0, v1),
                      emit_extract_vector(ctx, in, 1, v1),
                      emit_extract_vector(ctx, in, 2, v1) };
      bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]);
      break;
   }
   case nir_op_bcsel: {
      /* Boolean select is handled entirely by the emit_bcsel helper. */
      emit_bcsel(ctx, instr, dst);
      break;
   }
1384    case nir_op_frsq: {
1385       if (dst.size() == 1) {
1386          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f32, dst);
1387       } else if (dst.size() == 2) {
1388          emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
1389       } else {
1390          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1391          nir_print_instr(&instr->instr, stderr);
1392          fprintf(stderr, "\n");
1393       }
1394       break;
1395    }
   case nir_op_fneg: {
      /* Float negate, implemented as an integer XOR that flips bit 31 of the
       * (upper) dword. */
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.size() == 1) {
         bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x80000000u), as_vgpr(ctx, src));
      } else if (dst.size() == 2) {
         /* 64-bit: split, flip the top bit of the upper dword, and recombine.
          * NOTE(review): the split targets are VGPR temporaries - confirm src is
          * always a VGPR on this path. */
         Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
         upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
   case nir_op_fabs: {
      /* Float absolute value, implemented as an integer AND that clears bit 31
       * of the (upper) dword. */
      Temp src = get_alu_src(ctx, instr->src[0]);
      if (dst.size() == 1) {
         bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFFFFFu), as_vgpr(ctx, src));
      } else if (dst.size() == 2) {
         /* 64-bit: split, clear the top bit of the upper dword, and recombine.
          * NOTE(review): the split targets are VGPR temporaries - confirm src is
          * always a VGPR on this path. */
         Temp upper = bld.tmp(v1), lower = bld.tmp(v1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
         upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
      } else {
         fprintf(stderr, "Unimplemented NIR instr bit size: ");
         nir_print_instr(&instr->instr, stderr);
         fprintf(stderr, "\n");
      }
      break;
   }
1428    case nir_op_fsat: {
1429       Temp src = get_alu_src(ctx, instr->src[0]);
1430       if (dst.size() == 1) {
1431          bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
1432       } else if (dst.size() == 2) {
1433          Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand(0u));
1434          VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
1435          vop3->clamp = true;
1436       } else {
1437          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1438          nir_print_instr(&instr->instr, stderr);
1439          fprintf(stderr, "\n");
1440       }
1441       break;
1442    }
1443    case nir_op_flog2: {
1444       if (dst.size() == 1) {
1445          emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f32, dst);
1446       } else {
1447          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1448          nir_print_instr(&instr->instr, stderr);
1449          fprintf(stderr, "\n");
1450       }
1451       break;
1452    }
1453    case nir_op_frcp: {
1454       if (dst.size() == 1) {
1455          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f32, dst);
1456       } else if (dst.size() == 2) {
1457          emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
1458       } else {
1459          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1460          nir_print_instr(&instr->instr, stderr);
1461          fprintf(stderr, "\n");
1462       }
1463       break;
1464    }
1465    case nir_op_fexp2: {
1466       if (dst.size() == 1) {
1467          emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
1468       } else {
1469          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1470          nir_print_instr(&instr->instr, stderr);
1471          fprintf(stderr, "\n");
1472       }
1473       break;
1474    }
1475    case nir_op_fsqrt: {
1476       if (dst.size() == 1) {
1477          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f32, dst);
1478       } else if (dst.size() == 2) {
1479          emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
1480       } else {
1481          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1482          nir_print_instr(&instr->instr, stderr);
1483          fprintf(stderr, "\n");
1484       }
1485       break;
1486    }
1487    case nir_op_ffract: {
1488       if (dst.size() == 1) {
1489          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
1490       } else if (dst.size() == 2) {
1491          emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
1492       } else {
1493          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1494          nir_print_instr(&instr->instr, stderr);
1495          fprintf(stderr, "\n");
1496       }
1497       break;
1498    }
1499    case nir_op_ffloor: {
1500       if (dst.size() == 1) {
1501          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
1502       } else if (dst.size() == 2) {
1503          emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f64, dst);
1504       } else {
1505          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1506          nir_print_instr(&instr->instr, stderr);
1507          fprintf(stderr, "\n");
1508       }
1509       break;
1510    }
1511    case nir_op_fceil: {
1512       if (dst.size() == 1) {
1513          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
1514       } else if (dst.size() == 2) {
1515          emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst);
1516       } else {
1517          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1518          nir_print_instr(&instr->instr, stderr);
1519          fprintf(stderr, "\n");
1520       }
1521       break;
1522    }
1523    case nir_op_ftrunc: {
1524       if (dst.size() == 1) {
1525          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
1526       } else if (dst.size() == 2) {
1527          emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f64, dst);
1528       } else {
1529          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1530          nir_print_instr(&instr->instr, stderr);
1531          fprintf(stderr, "\n");
1532       }
1533       break;
1534    }
1535    case nir_op_fround_even: {
1536       if (dst.size() == 1) {
1537          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
1538       } else if (dst.size() == 2) {
1539          emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst);
1540       } else {
1541          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1542          nir_print_instr(&instr->instr, stderr);
1543          fprintf(stderr, "\n");
1544       }
1545       break;
1546    }
1547    case nir_op_fsin:
1548    case nir_op_fcos: {
1549       Temp src = get_alu_src(ctx, instr->src[0]);
1550       aco_ptr<Instruction> norm;
1551       if (dst.size() == 1) {
1552          Temp tmp;
1553          Operand half_pi(0x3e22f983u);
1554          if (src.type() == RegType::sgpr)
1555             tmp = bld.vop2_e64(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
1556          else
1557             tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
1558
1559          /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
1560          if (ctx->options->chip_class < GFX9)
1561             tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp);
1562
1563          aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
1564          bld.vop1(opcode, Definition(dst), tmp);
1565       } else {
1566          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1567          nir_print_instr(&instr->instr, stderr);
1568          fprintf(stderr, "\n");
1569       }
1570       break;
1571    }
1572    case nir_op_ldexp: {
1573       if (dst.size() == 1) {
1574          bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst),
1575                   as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1576                   get_alu_src(ctx, instr->src[1]));
1577       } else if (dst.size() == 2) {
1578          bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst),
1579                   as_vgpr(ctx, get_alu_src(ctx, instr->src[0])),
1580                   get_alu_src(ctx, instr->src[1]));
1581       } else {
1582          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1583          nir_print_instr(&instr->instr, stderr);
1584          fprintf(stderr, "\n");
1585       }
1586       break;
1587    }
1588    case nir_op_frexp_sig: {
1589       if (dst.size() == 1) {
1590          bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst),
1591                   get_alu_src(ctx, instr->src[0]));
1592       } else if (dst.size() == 2) {
1593          bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst),
1594                   get_alu_src(ctx, instr->src[0]));
1595       } else {
1596          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1597          nir_print_instr(&instr->instr, stderr);
1598          fprintf(stderr, "\n");
1599       }
1600       break;
1601    }
1602    case nir_op_frexp_exp: {
1603       if (instr->src[0].src.ssa->bit_size == 32) {
1604          bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst),
1605                   get_alu_src(ctx, instr->src[0]));
1606       } else if (instr->src[0].src.ssa->bit_size == 64) {
1607          bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst),
1608                   get_alu_src(ctx, instr->src[0]));
1609       } else {
1610          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1611          nir_print_instr(&instr->instr, stderr);
1612          fprintf(stderr, "\n");
1613       }
1614       break;
1615    }
1616    case nir_op_fsign: {
1617       Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
1618       if (dst.size() == 1) {
1619          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1620          src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
1621          cond = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1622          bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0xbf800000u), src, cond);
1623       } else if (dst.size() == 2) {
1624          Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1625          Temp tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0x3FF00000u));
1626          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, src, cond);
1627
1628          cond = bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(s2)), Operand(0u), src);
1629          tmp = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0xBFF00000u));
1630          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond);
1631
1632          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1633       } else {
1634          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1635          nir_print_instr(&instr->instr, stderr);
1636          fprintf(stderr, "\n");
1637       }
1638       break;
1639    }
1640    case nir_op_f2f32: {
1641       if (instr->src[0].src.ssa->bit_size == 64) {
1642          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
1643       } else {
1644          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1645          nir_print_instr(&instr->instr, stderr);
1646          fprintf(stderr, "\n");
1647       }
1648       break;
1649    }
1650    case nir_op_f2f64: {
1651       if (instr->src[0].src.ssa->bit_size == 32) {
1652          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_f32, dst);
1653       } else {
1654          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1655          nir_print_instr(&instr->instr, stderr);
1656          fprintf(stderr, "\n");
1657       }
1658       break;
1659    }
1660    case nir_op_i2f32: {
1661       assert(dst.size() == 1);
1662       emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_i32, dst);
1663       break;
1664    }
1665    case nir_op_i2f64: {
1666       if (instr->src[0].src.ssa->bit_size == 32) {
1667          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_i32, dst);
1668       } else if (instr->src[0].src.ssa->bit_size == 64) {
1669          Temp src = get_alu_src(ctx, instr->src[0]);
1670          RegClass rc = RegClass(src.type(), 1);
1671          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1672          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1673          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1674          upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper);
1675          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1676          bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1677
1678       } else {
1679          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1680          nir_print_instr(&instr->instr, stderr);
1681          fprintf(stderr, "\n");
1682       }
1683       break;
1684    }
1685    case nir_op_u2f32: {
1686       assert(dst.size() == 1);
1687       emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_u32, dst);
1688       break;
1689    }
1690    case nir_op_u2f64: {
1691       if (instr->src[0].src.ssa->bit_size == 32) {
1692          emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f64_u32, dst);
1693       } else if (instr->src[0].src.ssa->bit_size == 64) {
1694          Temp src = get_alu_src(ctx, instr->src[0]);
1695          RegClass rc = RegClass(src.type(), 1);
1696          Temp lower = bld.tmp(rc), upper = bld.tmp(rc);
1697          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src);
1698          lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower);
1699          upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper);
1700          upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
1701          bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
1702       } else {
1703          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1704          nir_print_instr(&instr->instr, stderr);
1705          fprintf(stderr, "\n");
1706       }
1707       break;
1708    }
1709    case nir_op_f2i32: {
1710       Temp src = get_alu_src(ctx, instr->src[0]);
1711       if (instr->src[0].src.ssa->bit_size == 32) {
1712          if (dst.type() == RegType::vgpr)
1713             bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
1714          else
1715             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1716                        bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
1717
1718       } else if (instr->src[0].src.ssa->bit_size == 64) {
1719          if (dst.type() == RegType::vgpr)
1720             bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
1721          else
1722             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1723                        bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
1724
1725       } else {
1726          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1727          nir_print_instr(&instr->instr, stderr);
1728          fprintf(stderr, "\n");
1729       }
1730       break;
1731    }
1732    case nir_op_f2u32: {
1733       Temp src = get_alu_src(ctx, instr->src[0]);
1734       if (instr->src[0].src.ssa->bit_size == 32) {
1735          if (dst.type() == RegType::vgpr)
1736             bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
1737          else
1738             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1739                        bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
1740
1741       } else if (instr->src[0].src.ssa->bit_size == 64) {
1742          if (dst.type() == RegType::vgpr)
1743             bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
1744          else
1745             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
1746                        bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
1747
1748       } else {
1749          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1750          nir_print_instr(&instr->instr, stderr);
1751          fprintf(stderr, "\n");
1752       }
1753       break;
1754    }
1755    case nir_op_f2i64: {
1756       Temp src = get_alu_src(ctx, instr->src[0]);
1757       if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
1758          Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1759          exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand(0x0u), exponent, Operand(64u));
1760          Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1761          Temp sign = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), src);
1762          mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1763          mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(7u), mantissa);
1764          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1765          Temp new_exponent = bld.tmp(v1);
1766          Temp borrow = bld.vsub32(Definition(new_exponent), Operand(63u), exponent, true).def(1).getTemp();
1767          mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa);
1768          Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand(0xfffffffeu));
1769          Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1770          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1771          lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, Operand(0xffffffffu), borrow);
1772          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow);
1773          lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower);
1774          upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper);
1775          Temp new_lower = bld.tmp(v1);
1776          borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp();
1777          Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow);
1778          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper);
1779
1780       } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
1781          if (src.type() == RegType::vgpr)
1782             src = bld.as_uniform(src);
1783          Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1784          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1785          exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1786          exponent = bld.sop2(aco_opcode::s_min_u32, bld.def(s1), bld.def(s1, scc), Operand(64u), exponent);
1787          Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1788          Temp sign = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand(31u));
1789          mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1790          mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, Operand(7u));
1791          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1792          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(63u), exponent);
1793          mantissa = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent);
1794          Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, Operand(0xffffffffu)); // exp >= 64
1795          Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand(0xfffffffeu));
1796          mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond);
1797          Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1798          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1799          lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, lower);
1800          upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper);
1801          Temp borrow = bld.tmp(s1);
1802          lower = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign);
1803          upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, borrow);
1804          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1805
1806       } else if (instr->src[0].src.ssa->bit_size == 64) {
1807          Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1808          Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1809          Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1810          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1811          Temp floor  = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1812          Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1813          Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1814          Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor);
1815          if (dst.type() == RegType::sgpr) {
1816             lower = bld.as_uniform(lower);
1817             upper = bld.as_uniform(upper);
1818          }
1819          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1820
1821       } else {
1822          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1823          nir_print_instr(&instr->instr, stderr);
1824          fprintf(stderr, "\n");
1825       }
1826       break;
1827    }
1828    case nir_op_f2u64: {
1829       Temp src = get_alu_src(ctx, instr->src[0]);
1830       if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::vgpr) {
1831          Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src);
1832          Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(s2)), Operand(64u), exponent);
1833          exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand(0x0u), exponent);
1834          Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffu), src);
1835          mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(0x800000u), mantissa);
1836          Temp exponent_small = bld.vsub32(bld.def(v1), Operand(24u), exponent);
1837          Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa);
1838          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(0u), mantissa);
1839          Temp new_exponent = bld.tmp(v1);
1840          Temp cond_small = bld.vsub32(Definition(new_exponent), exponent, Operand(24u), true).def(1).getTemp();
1841          mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa);
1842          Temp lower = bld.tmp(v1), upper = bld.tmp(v1);
1843          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1844          lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small);
1845          upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand(0u), cond_small);
1846          lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), lower, exponent_in_range);
1847          upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xffffffffu), upper, exponent_in_range);
1848          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1849
1850       } else if (instr->src[0].src.ssa->bit_size == 32 && dst.type() == RegType::sgpr) {
1851          if (src.type() == RegType::vgpr)
1852             src = bld.as_uniform(src);
1853          Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, Operand(0x80017u));
1854          exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(126u));
1855          exponent = bld.sop2(aco_opcode::s_max_u32, bld.def(s1), bld.def(s1, scc), Operand(0u), exponent);
1856          Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0x7fffffu), src);
1857          mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(0x800000u), mantissa);
1858          Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), Operand(24u), exponent);
1859          Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, exponent_small);
1860          mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), mantissa);
1861          Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), exponent, Operand(24u));
1862          mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent_large);
1863          Temp cond = bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand(64u), exponent);
1864          mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, Operand(0xffffffffu), cond);
1865          Temp lower = bld.tmp(s1), upper = bld.tmp(s1);
1866          bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa);
1867          Temp cond_small = bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand(24u));
1868          lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small);
1869          upper = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand(0u), upper, cond_small);
1870          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1871
1872       } else if (instr->src[0].src.ssa->bit_size == 64) {
1873          Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0x3df00000u));
1874          Temp trunc = bld.vop1(aco_opcode::v_trunc_f64, bld.def(v2), src);
1875          Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec);
1876          vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(0u), Operand(0xc1f00000u));
1877          Temp floor  = bld.vop1(aco_opcode::v_floor_f64, bld.def(v2), mul);
1878          Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc);
1879          Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma);
1880          Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor);
1881          if (dst.type() == RegType::sgpr) {
1882             lower = bld.as_uniform(lower);
1883             upper = bld.as_uniform(upper);
1884          }
1885          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
1886
1887       } else {
1888          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1889          nir_print_instr(&instr->instr, stderr);
1890          fprintf(stderr, "\n");
1891       }
1892       break;
1893    }
1894    case nir_op_b2f32: {
1895       Temp src = get_alu_src(ctx, instr->src[0]);
1896       if (dst.regClass() == s1) {
1897          src = as_uniform_bool(ctx, src);
1898          bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3f800000u), src);
1899       } else if (dst.regClass() == v1) {
1900          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(0x3f800000u),
1901                       as_divergent_bool(ctx, src, true));
1902       } else {
1903          unreachable("Wrong destination register class for nir_op_b2f32.");
1904       }
1905       break;
1906    }
1907    case nir_op_b2f64: {
1908       Temp src = get_alu_src(ctx, instr->src[0]);
1909       if (dst.regClass() == s2) {
1910          src = as_uniform_bool(ctx, src);
1911          bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand(0x3f800000u), Operand(0u), bld.scc(src));
1912       } else if (dst.regClass() == v2) {
1913          Temp one = bld.vop1(aco_opcode::v_mov_b32, bld.def(v2), Operand(0x3FF00000u));
1914          Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one,
1915                       as_divergent_bool(ctx, src, true));
1916          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
1917       } else {
1918          unreachable("Wrong destination register class for nir_op_b2f64.");
1919       }
1920       break;
1921    }
1922    case nir_op_i2i32: {
1923       Temp src = get_alu_src(ctx, instr->src[0]);
1924       if (instr->src[0].src.ssa->bit_size == 64) {
1925          /* we can actually just say dst = src, as it would map the lower register */
1926          emit_extract_vector(ctx, src, 0, dst);
1927       } else {
1928          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1929          nir_print_instr(&instr->instr, stderr);
1930          fprintf(stderr, "\n");
1931       }
1932       break;
1933    }
1934    case nir_op_u2u32: {
1935       Temp src = get_alu_src(ctx, instr->src[0]);
1936       if (instr->src[0].src.ssa->bit_size == 16) {
1937          if (dst.regClass() == s1) {
1938             bld.sop2(aco_opcode::s_and_b32, Definition(dst), bld.def(s1, scc), Operand(0xFFFFu), src);
1939          } else {
1940             // TODO: do better with SDWA
1941             bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0xFFFFu), src);
1942          }
1943       } else if (instr->src[0].src.ssa->bit_size == 64) {
1944          /* we can actually just say dst = src, as it would map the lower register */
1945          emit_extract_vector(ctx, src, 0, dst);
1946       } else {
1947          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1948          nir_print_instr(&instr->instr, stderr);
1949          fprintf(stderr, "\n");
1950       }
1951       break;
1952    }
1953    case nir_op_i2i64: {
1954       Temp src = get_alu_src(ctx, instr->src[0]);
1955       if (instr->src[0].src.ssa->bit_size == 32) {
1956          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
1957       } else {
1958          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1959          nir_print_instr(&instr->instr, stderr);
1960          fprintf(stderr, "\n");
1961       }
1962       break;
1963    }
1964    case nir_op_u2u64: {
1965       Temp src = get_alu_src(ctx, instr->src[0]);
1966       if (instr->src[0].src.ssa->bit_size == 32) {
1967          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src, Operand(0u));
1968       } else {
1969          fprintf(stderr, "Unimplemented NIR instr bit size: ");
1970          nir_print_instr(&instr->instr, stderr);
1971          fprintf(stderr, "\n");
1972       }
1973       break;
1974    }
1975    case nir_op_b2i32: {
1976       Temp src = get_alu_src(ctx, instr->src[0]);
1977       if (dst.regClass() == s1) {
1978          if (src.regClass() == s1) {
1979             bld.copy(Definition(dst), src);
1980          } else {
1981             // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
1982             assert(src.regClass() == s2);
1983             bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(Definition(dst)), Operand(0u), src);
1984          }
1985       } else {
1986          assert(dst.regClass() == v1 && src.regClass() == s2);
1987          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
1988       }
1989       break;
1990    }
1991    case nir_op_i2b1: {
1992       Temp src = get_alu_src(ctx, instr->src[0]);
1993       if (dst.regClass() == s2) {
1994          assert(src.regClass() == v1 || src.regClass() == v2);
1995          bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32,
1996                   Definition(dst), Operand(0u), src).def(0).setHint(vcc);
1997       } else {
1998          assert(src.regClass() == s1 && dst.regClass() == s1);
1999          bld.sopc(aco_opcode::s_cmp_lg_u32, bld.scc(Definition(dst)), Operand(0u), src);
2000       }
2001       break;
2002    }
2003    case nir_op_pack_64_2x32_split: {
2004       Temp src0 = get_alu_src(ctx, instr->src[0]);
2005       Temp src1 = get_alu_src(ctx, instr->src[1]);
2006
2007       bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1);
2008       break;
2009    }
2010    case nir_op_unpack_64_2x32_split_x:
2011       bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), get_alu_src(ctx, instr->src[0]));
2012       break;
2013    case nir_op_unpack_64_2x32_split_y:
2014       bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), get_alu_src(ctx, instr->src[0]));
2015       break;
2016    case nir_op_pack_half_2x16: {
2017       Temp src = get_alu_src(ctx, instr->src[0], 2);
2018
2019       if (dst.regClass() == v1) {
2020          Temp src0 = bld.tmp(v1);
2021          Temp src1 = bld.tmp(v1);
2022          bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
2023          bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
2024
2025       } else {
2026          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2027          nir_print_instr(&instr->instr, stderr);
2028          fprintf(stderr, "\n");
2029       }
2030       break;
2031    }
2032    case nir_op_unpack_half_2x16_split_x: {
2033       if (dst.regClass() == v1) {
2034          Builder bld(ctx->program, ctx->block);
2035          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
2036       } else {
2037          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2038          nir_print_instr(&instr->instr, stderr);
2039          fprintf(stderr, "\n");
2040       }
2041       break;
2042    }
2043    case nir_op_unpack_half_2x16_split_y: {
2044       if (dst.regClass() == v1) {
2045          Builder bld(ctx->program, ctx->block);
2046          /* TODO: use SDWA here */
2047          bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
2048                   bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
2049       } else {
2050          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2051          nir_print_instr(&instr->instr, stderr);
2052          fprintf(stderr, "\n");
2053       }
2054       break;
2055    }
2056    case nir_op_fquantize2f16: {
2057       Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), get_alu_src(ctx, instr->src[0]));
2058
2059       Temp mask = bld.copy(bld.def(s1), Operand(0x36Fu)); /* value is NOT negative/positive denormal value */
2060
2061       Temp cmp_res = bld.tmp(s2);
2062       bld.vopc_e64(aco_opcode::v_cmp_class_f16, Definition(cmp_res), f16, mask).def(0).setHint(vcc);
2063
2064       Temp f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16);
2065
2066       bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), f32, cmp_res);
2067       break;
2068    }
2069    case nir_op_bfm: {
2070       Temp bits = get_alu_src(ctx, instr->src[0]);
2071       Temp offset = get_alu_src(ctx, instr->src[1]);
2072
2073       if (dst.regClass() == s1) {
2074          bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset);
2075       } else if (dst.regClass() == v1) {
2076          bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
2077       } else {
2078          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2079          nir_print_instr(&instr->instr, stderr);
2080          fprintf(stderr, "\n");
2081       }
2082       break;
2083    }
2084    case nir_op_bitfield_select: {
2085       /* (mask & insert) | (~mask & base) */
2086       Temp bitmask = get_alu_src(ctx, instr->src[0]);
2087       Temp insert = get_alu_src(ctx, instr->src[1]);
2088       Temp base = get_alu_src(ctx, instr->src[2]);
2089
2090       /* dst = (insert & bitmask) | (base & ~bitmask) */
2091       if (dst.regClass() == s1) {
2092          aco_ptr<Instruction> sop2;
2093          nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src);
2094          nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src);
2095          Operand lhs;
2096          if (const_insert && const_bitmask) {
2097             lhs = Operand(const_insert->u32 & const_bitmask->u32);
2098          } else {
2099             insert = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask);
2100             lhs = Operand(insert);
2101          }
2102
2103          Operand rhs;
2104          nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src);
2105          if (const_base && const_bitmask) {
2106             rhs = Operand(const_base->u32 & ~const_bitmask->u32);
2107          } else {
2108             base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask);
2109             rhs = Operand(base);
2110          }
2111
2112          bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs);
2113
2114       } else if (dst.regClass() == v1) {
2115          if (base.type() == RegType::sgpr && (bitmask.type() == RegType::sgpr || (insert.type() == RegType::sgpr)))
2116             base = as_vgpr(ctx, base);
2117          if (insert.type() == RegType::sgpr && bitmask.type() == RegType::sgpr)
2118             insert = as_vgpr(ctx, insert);
2119
2120          bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
2121
2122       } else {
2123          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2124          nir_print_instr(&instr->instr, stderr);
2125          fprintf(stderr, "\n");
2126       }
2127       break;
2128    }
2129    case nir_op_ubfe:
2130    case nir_op_ibfe: {
2131       Temp base = get_alu_src(ctx, instr->src[0]);
2132       Temp offset = get_alu_src(ctx, instr->src[1]);
2133       Temp bits = get_alu_src(ctx, instr->src[2]);
2134
2135       if (dst.type() == RegType::sgpr) {
2136          Operand extract;
2137          nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src);
2138          nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src);
2139          if (const_offset && const_bits) {
2140             uint32_t const_extract = (const_bits->u32 << 16) | const_offset->u32;
2141             extract = Operand(const_extract);
2142          } else {
2143             Operand width;
2144             if (const_bits) {
2145                width = Operand(const_bits->u32 << 16);
2146             } else {
2147                width = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), bits, Operand(16u));
2148             }
2149             extract = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), offset, width);
2150          }
2151
2152          aco_opcode opcode;
2153          if (dst.regClass() == s1) {
2154             if (instr->op == nir_op_ubfe)
2155                opcode = aco_opcode::s_bfe_u32;
2156             else
2157                opcode = aco_opcode::s_bfe_i32;
2158          } else if (dst.regClass() == s2) {
2159             if (instr->op == nir_op_ubfe)
2160                opcode = aco_opcode::s_bfe_u64;
2161             else
2162                opcode = aco_opcode::s_bfe_i64;
2163          } else {
2164             unreachable("Unsupported BFE bit size");
2165          }
2166
2167          bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, extract);
2168
2169       } else {
2170          aco_opcode opcode;
2171          if (dst.regClass() == v1) {
2172             if (instr->op == nir_op_ubfe)
2173                opcode = aco_opcode::v_bfe_u32;
2174             else
2175                opcode = aco_opcode::v_bfe_i32;
2176          } else {
2177             unreachable("Unsupported BFE bit size");
2178          }
2179
2180          emit_vop3a_instruction(ctx, instr, opcode, dst);
2181       }
2182       break;
2183    }
2184    case nir_op_bit_count: {
2185       Temp src = get_alu_src(ctx, instr->src[0]);
2186       if (src.regClass() == s1) {
2187          bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src);
2188       } else if (src.regClass() == v1) {
2189          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand(0u));
2190       } else if (src.regClass() == v2) {
2191          bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst),
2192                   emit_extract_vector(ctx, src, 1, v1),
2193                   bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1),
2194                            emit_extract_vector(ctx, src, 0, v1), Operand(0u)));
2195       } else if (src.regClass() == s2) {
2196          bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
2197       } else {
2198          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2199          nir_print_instr(&instr->instr, stderr);
2200          fprintf(stderr, "\n");
2201       }
2202       break;
2203    }
2204    case nir_op_flt: {
2205       if (instr->src[0].src.ssa->bit_size == 32)
2206          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f32, dst);
2207       else if (instr->src[0].src.ssa->bit_size == 64)
2208          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_f64, dst);
2209       break;
2210    }
2211    case nir_op_fge: {
2212       if (instr->src[0].src.ssa->bit_size == 32)
2213          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f32, dst);
2214       else if (instr->src[0].src.ssa->bit_size == 64)
2215          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_f64, dst);
2216       break;
2217    }
2218    case nir_op_feq: {
2219       if (instr->src[0].src.ssa->bit_size == 32)
2220          emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f32, dst);
2221       else if (instr->src[0].src.ssa->bit_size == 64)
2222          emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_f64, dst);
2223       break;
2224    }
2225    case nir_op_fne: {
2226       if (instr->src[0].src.ssa->bit_size == 32)
2227          emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f32, dst);
2228       else if (instr->src[0].src.ssa->bit_size == 64)
2229          emit_comparison(ctx, instr, aco_opcode::v_cmp_neq_f64, dst);
2230       break;
2231    }
2232    case nir_op_ilt: {
2233       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2234          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i32, dst);
2235       else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2236          emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_i32, dst);
2237       else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2238          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_i64, dst);
2239       break;
2240    }
2241    case nir_op_ige: {
2242       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2243          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i32, dst);
2244       else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2245          emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_i32, dst);
2246       else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2247          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_i64, dst);
2248       break;
2249    }
2250    case nir_op_ieq: {
2251       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
2252          emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i32, dst);
2253       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
2254          emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_i32, dst);
2255       } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
2256          emit_comparison(ctx, instr, aco_opcode::v_cmp_eq_i64, dst);
2257       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
2258          emit_comparison(ctx, instr, aco_opcode::s_cmp_eq_u64, dst);
2259       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
2260          Temp src0 = get_alu_src(ctx, instr->src[0]);
2261          Temp src1 = get_alu_src(ctx, instr->src[1]);
2262          bld.sopc(aco_opcode::s_cmp_eq_i32, bld.scc(Definition(dst)),
2263                   as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
2264       } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
2265          Temp src0 = get_alu_src(ctx, instr->src[0]);
2266          Temp src1 = get_alu_src(ctx, instr->src[1]);
2267          bld.sop2(aco_opcode::s_xnor_b64, Definition(dst), bld.def(s1, scc),
2268                   as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
2269       } else {
2270          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2271          nir_print_instr(&instr->instr, stderr);
2272          fprintf(stderr, "\n");
2273       }
2274       break;
2275    }
2276    case nir_op_ine: {
2277       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32) {
2278          emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i32, dst);
2279       } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64) {
2280          emit_comparison(ctx, instr, aco_opcode::v_cmp_lg_i64, dst);
2281       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32) {
2282          emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_i32, dst);
2283       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 64) {
2284          emit_comparison(ctx, instr, aco_opcode::s_cmp_lg_u64, dst);
2285       } else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 1) {
2286          Temp src0 = get_alu_src(ctx, instr->src[0]);
2287          Temp src1 = get_alu_src(ctx, instr->src[1]);
2288          bld.sopc(aco_opcode::s_cmp_lg_i32, bld.scc(Definition(dst)),
2289                   as_uniform_bool(ctx, src0), as_uniform_bool(ctx, src1));
2290       } else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 1) {
2291          Temp src0 = get_alu_src(ctx, instr->src[0]);
2292          Temp src1 = get_alu_src(ctx, instr->src[1]);
2293          bld.sop2(aco_opcode::s_xor_b64, Definition(dst), bld.def(s1, scc),
2294                   as_divergent_bool(ctx, src0, false), as_divergent_bool(ctx, src1, false));
2295       } else {
2296          fprintf(stderr, "Unimplemented NIR instr bit size: ");
2297          nir_print_instr(&instr->instr, stderr);
2298          fprintf(stderr, "\n");
2299       }
2300       break;
2301    }
2302    case nir_op_ult: {
2303       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2304          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u32, dst);
2305       else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2306          emit_comparison(ctx, instr, aco_opcode::s_cmp_lt_u32, dst);
2307       else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2308          emit_comparison(ctx, instr, aco_opcode::v_cmp_lt_u64, dst);
2309       break;
2310    }
2311    case nir_op_uge: {
2312       if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 32)
2313          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u32, dst);
2314       else if (dst.regClass() == s1 && instr->src[0].src.ssa->bit_size == 32)
2315          emit_comparison(ctx, instr, aco_opcode::s_cmp_ge_u32, dst);
2316       else if (dst.regClass() == s2 && instr->src[0].src.ssa->bit_size == 64)
2317          emit_comparison(ctx, instr, aco_opcode::v_cmp_ge_u64, dst);
2318       break;
2319    }
2320    case nir_op_fddx:
2321    case nir_op_fddy:
2322    case nir_op_fddx_fine:
2323    case nir_op_fddy_fine:
2324    case nir_op_fddx_coarse:
2325    case nir_op_fddy_coarse: {
2326       Definition tl = bld.def(v1);
2327       uint16_t dpp_ctrl;
2328       if (instr->op == nir_op_fddx_fine) {
2329          bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 2, 2));
2330          dpp_ctrl = dpp_quad_perm(1, 1, 3, 3);
2331       } else if (instr->op == nir_op_fddy_fine) {
2332          bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 1, 0, 1));
2333          dpp_ctrl = dpp_quad_perm(2, 3, 2, 3);
2334       } else {
2335          bld.vop1_dpp(aco_opcode::v_mov_b32, tl, get_alu_src(ctx, instr->src[0]), dpp_quad_perm(0, 0, 0, 0));
2336          if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse)
2337             dpp_ctrl = dpp_quad_perm(1, 1, 1, 1);
2338          else
2339             dpp_ctrl = dpp_quad_perm(2, 2, 2, 2);
2340       }
2341
2342       Definition tmp = bld.def(v1);
2343       bld.vop2_dpp(aco_opcode::v_sub_f32, tmp, get_alu_src(ctx, instr->src[0]), tl.getTemp(), dpp_ctrl);
2344       emit_wqm(ctx, tmp.getTemp(), dst, true);
2345       break;
2346    }
2347    default:
2348       fprintf(stderr, "Unknown NIR ALU instr: ");
2349       nir_print_instr(&instr->instr, stderr);
2350       fprintf(stderr, "\n");
2351    }
2352 }
2353
2354 void visit_load_const(isel_context *ctx, nir_load_const_instr *instr)
2355 {
2356    Temp dst = get_ssa_temp(ctx, &instr->def);
2357
2358    // TODO: we really want to have the resulting type as this would allow for 64bit literals
2359    // which get truncated the lsb if double and msb if int
2360    // for now, we only use s_mov_b64 with 64bit inline constants
2361    assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar.");
2362    assert(dst.type() == RegType::sgpr);
2363
2364    if (dst.size() == 1)
2365    {
2366       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(instr->value[0].u32));
2367    } else {
2368       assert(dst.size() != 1);
2369       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2370       if (instr->def.bit_size == 64)
2371          for (unsigned i = 0; i < dst.size(); i++)
2372             vec->operands[i] = Operand{(uint32_t)(instr->value[0].u64 >> i * 32)};
2373       else {
2374          for (unsigned i = 0; i < dst.size(); i++)
2375             vec->operands[i] = Operand{instr->value[i].u32};
2376       }
2377       vec->definitions[0] = Definition(dst);
2378       ctx->block->instructions.emplace_back(std::move(vec));
2379    }
2380 }
2381
/* Expands every set bit i of `mask` into `multiplier` consecutive set bits
 * starting at bit position i * multiplier.
 *
 * Example: widen_mask(0b0101, 2) == 0b00110011. This is used to turn a
 * per-component write mask into a per-dword one for 64-bit values.
 *
 * Bits that would land at or beyond bit 32 are dropped, and a multiplier of 0
 * yields 0. No shift by 32 or more is ever performed, so the result is well
 * defined for every input.
 */
uint32_t widen_mask(uint32_t mask, unsigned multiplier)
{
   if (multiplier == 0)
      return 0;

   /* `multiplier` low bits set; a plain (1u << multiplier) would be undefined for >= 32. */
   const uint32_t group = multiplier >= 32 ? 0xffffffffu : (1u << multiplier) - 1u;

   uint32_t new_mask = 0;
   /* i * multiplier < 32 (with multiplier >= 1) also guarantees i < 32, so
    * both shifts below stay in range. Stop early once no higher bit is set. */
   for (unsigned i = 0; i * multiplier < 32 && (mask >> i) != 0; ++i) {
      if (mask & (1u << i))
         new_mask |= group << (i * multiplier);
   }
   return new_mask;
}
2390
2391 void visit_store_vs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2392 {
2393    /* This wouldn't work inside control flow or with indirect offsets but
2394     * that doesn't happen because of nir_lower_io_to_temporaries(). */
2395
2396    unsigned write_mask = nir_intrinsic_write_mask(instr);
2397    unsigned component = nir_intrinsic_component(instr);
2398    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2399    unsigned idx = nir_intrinsic_base(instr) + component;
2400
2401    nir_instr *off_instr = instr->src[1].ssa->parent_instr;
2402    if (off_instr->type != nir_instr_type_load_const) {
2403       fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2404       nir_print_instr(off_instr, stderr);
2405       fprintf(stderr, "\n");
2406    }
2407    idx += nir_instr_as_load_const(off_instr)->value[0].u32 * 4u;
2408
2409    if (instr->src[0].ssa->bit_size == 64)
2410       write_mask = widen_mask(write_mask, 2);
2411
2412    for (unsigned i = 0; i < 8; ++i) {
2413       if (write_mask & (1 << i)) {
2414          ctx->vs_output.mask[idx / 4u] |= 1 << (idx % 4u);
2415          ctx->vs_output.outputs[idx / 4u][idx % 4u] = emit_extract_vector(ctx, src, i, v1);
2416       }
2417       idx++;
2418    }
2419 }
2420
2421 void visit_store_fs_output(isel_context *ctx, nir_intrinsic_instr *instr)
2422 {
2423    unsigned write_mask = nir_intrinsic_write_mask(instr);
2424    Operand values[4];
2425    Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
2426    for (unsigned i = 0; i < 4; ++i) {
2427       if (write_mask & (1 << i)) {
2428          Temp tmp = emit_extract_vector(ctx, src, i, v1);
2429          values[i] = Operand(tmp);
2430       } else {
2431          values[i] = Operand(v1);
2432       }
2433    }
2434
2435    unsigned index = nir_intrinsic_base(instr) / 4;
2436    unsigned target, col_format;
2437    unsigned enabled_channels = 0xF;
2438    aco_opcode compr_op = (aco_opcode)0;
2439
2440    nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2441    assert(offset && "Non-const offsets on exports not yet supported");
2442    index += offset->u32;
2443
2444    assert(index != FRAG_RESULT_COLOR);
2445
2446    /* Unlike vertex shader exports, it's fine to use multiple exports to
2447     * export separate channels of one target. So shaders which export both
2448     * FRAG_RESULT_SAMPLE_MASK and FRAG_RESULT_DEPTH should work fine.
2449     * TODO: combine the exports in those cases and create better code
2450     */
2451
2452    if (index == FRAG_RESULT_SAMPLE_MASK) {
2453
2454       if (ctx->program->info->ps.writes_z) {
2455          target = V_008DFC_SQ_EXP_MRTZ;
2456          enabled_channels = 0x4;
2457          col_format = (unsigned) -1;
2458
2459          values[2] = values[0];
2460          values[0] = Operand(v1);
2461       } else {
2462          aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2463          exp->valid_mask = false;
2464          exp->done = false;
2465          exp->compressed = true;
2466          exp->dest = V_008DFC_SQ_EXP_MRTZ;
2467          exp->enabled_mask = 0xc;
2468          for (int i = 0; i < 4; i++)
2469             exp->operands[i] = Operand(v1);
2470          exp->operands[1] = Operand(values[0]);
2471          ctx->block->instructions.emplace_back(std::move(exp));
2472          return;
2473       }
2474
2475    } else if (index == FRAG_RESULT_DEPTH) {
2476
2477       target = V_008DFC_SQ_EXP_MRTZ;
2478       enabled_channels = 0x1;
2479       col_format = (unsigned) -1;
2480
2481    } else if (index == FRAG_RESULT_STENCIL) {
2482
2483       if (ctx->program->info->ps.writes_z) {
2484          target = V_008DFC_SQ_EXP_MRTZ;
2485          enabled_channels = 0x2;
2486          col_format = (unsigned) -1;
2487
2488          values[1] = values[0];
2489          values[0] = Operand(v1);
2490       } else {
2491          aco_ptr<Instruction> shift{create_instruction<VOP2_instruction>(aco_opcode::v_lshlrev_b32, Format::VOP2, 2, 1)};
2492          shift->operands[0] = Operand((uint32_t) 16);
2493          shift->operands[1] = values[0];
2494          Temp tmp = {ctx->program->allocateId(), v1};
2495          shift->definitions[0] = Definition(tmp);
2496          ctx->block->instructions.emplace_back(std::move(shift));
2497
2498          aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2499          exp->valid_mask = false;
2500          exp->done = false;
2501          exp->compressed = true;
2502          exp->dest = V_008DFC_SQ_EXP_MRTZ;
2503          exp->enabled_mask = 0x3;
2504          exp->operands[0] = Operand(tmp);
2505          for (int i = 1; i < 4; i++)
2506             exp->operands[i] = Operand(v1);
2507          ctx->block->instructions.emplace_back(std::move(exp));
2508          return;
2509       }
2510
2511    } else {
2512       index -= FRAG_RESULT_DATA0;
2513       target = V_008DFC_SQ_EXP_MRT + index;
2514       col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
2515    }
2516    ASSERTED bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
2517    ASSERTED bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1;
2518    assert(!is_int8 && !is_int10);
2519
2520    switch (col_format)
2521    {
2522    case V_028714_SPI_SHADER_ZERO:
2523       enabled_channels = 0; /* writemask */
2524       target = V_008DFC_SQ_EXP_NULL;
2525       break;
2526
2527    case V_028714_SPI_SHADER_32_R:
2528       enabled_channels = 1;
2529       break;
2530
2531    case V_028714_SPI_SHADER_32_GR:
2532       enabled_channels = 0x3;
2533       break;
2534
2535    case V_028714_SPI_SHADER_32_AR:
2536       enabled_channels = 0x9;
2537       break;
2538
2539    case V_028714_SPI_SHADER_FP16_ABGR:
2540       enabled_channels = 0x5;
2541       compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
2542       break;
2543
2544    case V_028714_SPI_SHADER_UNORM16_ABGR:
2545       enabled_channels = 0x5;
2546       compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
2547       break;
2548
2549    case V_028714_SPI_SHADER_SNORM16_ABGR:
2550       enabled_channels = 0x5;
2551       compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
2552       break;
2553
2554    case V_028714_SPI_SHADER_UINT16_ABGR:
2555       enabled_channels = 0x5;
2556       compr_op = aco_opcode::v_cvt_pk_u16_u32;
2557       break;
2558
2559    case V_028714_SPI_SHADER_SINT16_ABGR:
2560       enabled_channels = 0x5;
2561       compr_op = aco_opcode::v_cvt_pk_i16_i32;
2562       break;
2563
2564    case V_028714_SPI_SHADER_32_ABGR:
2565       enabled_channels = 0xF;
2566       break;
2567
2568    default:
2569       break;
2570    }
2571
2572    if (target == V_008DFC_SQ_EXP_NULL)
2573       return;
2574
2575    if ((bool)compr_op)
2576    {
2577       for (int i = 0; i < 2; i++)
2578       {
2579          /* check if at least one of the values to be compressed is enabled */
2580          unsigned enabled = (write_mask >> (i*2) | write_mask >> (i*2+1)) & 0x1;
2581          if (enabled) {
2582             enabled_channels |= enabled << (i*2);
2583             aco_ptr<VOP3A_instruction> compr{create_instruction<VOP3A_instruction>(compr_op, Format::VOP3A, 2, 1)};
2584             Temp tmp{ctx->program->allocateId(), v1};
2585             compr->operands[0] = values[i*2].isUndefined() ? Operand(0u) : values[i*2];
2586             compr->operands[1] = values[i*2+1].isUndefined() ? Operand(0u): values[i*2+1];
2587             compr->definitions[0] = Definition(tmp);
2588             values[i] = Operand(tmp);
2589             ctx->block->instructions.emplace_back(std::move(compr));
2590          } else {
2591             values[i] = Operand(v1);
2592          }
2593       }
2594    }
2595
2596    aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
2597    exp->valid_mask = false;
2598    exp->done = false;
2599    exp->compressed = (bool) compr_op;
2600    exp->dest = target;
2601    exp->enabled_mask = enabled_channels;
2602    if ((bool) compr_op) {
2603       for (int i = 0; i < 2; i++)
2604          exp->operands[i] = enabled_channels & (3 << (i * 2)) ? values[i] : Operand(v1);
2605       exp->operands[2] = Operand(v1);
2606       exp->operands[3] = Operand(v1);
2607    } else {
2608       for (int i = 0; i < 4; i++)
2609          exp->operands[i] = enabled_channels & (1 << i) ? values[i] : Operand(v1);
2610    }
2611
2612    ctx->block->instructions.emplace_back(std::move(exp));
2613 }
2614
2615 void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
2616 {
2617    if (ctx->stage == vertex_vs) {
2618       visit_store_vs_output(ctx, instr);
2619    } else if (ctx->stage == fragment_fs) {
2620       visit_store_fs_output(ctx, instr);
2621    } else {
2622       unreachable("Shader stage not implemented");
2623    }
2624 }
2625
2626 void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp src, Temp dst, Temp prim_mask)
2627 {
2628    Temp coord1 = emit_extract_vector(ctx, src, 0, v1);
2629    Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
2630
2631    Builder bld(ctx->program, ctx->block);
2632    Temp tmp = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
2633    bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), tmp, idx, component);
2634 }
2635
2636 void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
2637 {
2638    aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1));
2639    for (unsigned i = 0; i < num_components; i++)
2640       vec->operands[i] = Operand(ctx->fs_inputs[fs_input::frag_pos_0 + i]);
2641
2642    if (ctx->fs_vgpr_args[fs_input::frag_pos_3]) {
2643       assert(num_components == 4);
2644       Builder bld(ctx->program, ctx->block);
2645       vec->operands[3] = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ctx->fs_inputs[fs_input::frag_pos_3]);
2646    }
2647
2648    for (Operand& op : vec->operands)
2649       op = op.isUndefined() ? Operand(0u) : op;
2650
2651    vec->definitions[0] = Definition(dst);
2652    ctx->block->instructions.emplace_back(std::move(vec));
2653    emit_split_vector(ctx, dst, num_components);
2654    return;
2655 }
2656
2657 void visit_load_interpolated_input(isel_context *ctx, nir_intrinsic_instr *instr)
2658 {
2659    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2660    Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
2661    unsigned idx = nir_intrinsic_base(instr);
2662    unsigned component = nir_intrinsic_component(instr);
2663    Temp prim_mask = ctx->prim_mask;
2664
2665    nir_const_value* offset = nir_src_as_const_value(instr->src[1]);
2666    if (offset) {
2667       assert(offset->u32 == 0);
2668    } else {
2669       /* the lower 15bit of the prim_mask contain the offset into LDS
2670        * while the upper bits contain the number of prims */
2671       Temp offset_src = get_ssa_temp(ctx, instr->src[1].ssa);
2672       assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
2673       Builder bld(ctx->program, ctx->block);
2674       Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
2675       stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
2676       stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
2677       offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
2678       prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
2679    }
2680
2681    if (instr->dest.ssa.num_components == 1) {
2682       emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
2683    } else {
2684       aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
2685       for (unsigned i = 0; i < instr->dest.ssa.num_components; i++)
2686       {
2687          Temp tmp = {ctx->program->allocateId(), v1};
2688          emit_interp_instr(ctx, idx, component+i, coords, tmp, prim_mask);
2689          vec->operands[i] = Operand(tmp);
2690       }
2691       vec->definitions[0] = Definition(dst);
2692       ctx->block->instructions.emplace_back(std::move(vec));
2693    }
2694 }
2695
2696 unsigned get_num_channels_from_data_format(unsigned data_format)
2697 {
2698    switch (data_format) {
2699    case V_008F0C_BUF_DATA_FORMAT_8:
2700    case V_008F0C_BUF_DATA_FORMAT_16:
2701    case V_008F0C_BUF_DATA_FORMAT_32:
2702       return 1;
2703    case V_008F0C_BUF_DATA_FORMAT_8_8:
2704    case V_008F0C_BUF_DATA_FORMAT_16_16:
2705    case V_008F0C_BUF_DATA_FORMAT_32_32:
2706       return 2;
2707    case V_008F0C_BUF_DATA_FORMAT_10_11_11:
2708    case V_008F0C_BUF_DATA_FORMAT_11_11_10:
2709    case V_008F0C_BUF_DATA_FORMAT_32_32_32:
2710       return 3;
2711    case V_008F0C_BUF_DATA_FORMAT_8_8_8_8:
2712    case V_008F0C_BUF_DATA_FORMAT_10_10_10_2:
2713    case V_008F0C_BUF_DATA_FORMAT_2_10_10_10:
2714    case V_008F0C_BUF_DATA_FORMAT_16_16_16_16:
2715    case V_008F0C_BUF_DATA_FORMAT_32_32_32_32:
2716       return 4;
2717    default:
2718       break;
2719    }
2720
2721    return 4;
2722 }
2723
2724 /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW.
2725  * so we may need to fix it up. */
2726 Temp adjust_vertex_fetch_alpha(isel_context *ctx, unsigned adjustment, Temp alpha)
2727 {
2728    Builder bld(ctx->program, ctx->block);
2729
2730    if (adjustment == RADV_ALPHA_ADJUST_SSCALED)
2731       alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha);
2732
2733    /* For the integer-like cases, do a natural sign extension.
2734     *
2735     * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
2736     * and happen to contain 0, 1, 2, 3 as the two LSBs of the
2737     * exponent.
2738     */
2739    alpha = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(adjustment == RADV_ALPHA_ADJUST_SNORM ? 7u : 30u), alpha);
2740    alpha = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(30u), alpha);
2741
2742    /* Convert back to the right type. */
2743    if (adjustment == RADV_ALPHA_ADJUST_SNORM) {
2744       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
2745       Temp clamp = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0xbf800000u), alpha);
2746       alpha = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0xbf800000u), alpha, clamp);
2747    } else if (adjustment == RADV_ALPHA_ADJUST_SSCALED) {
2748       alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha);
2749    }
2750
2751    return alpha;
2752 }
2753
2754 void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
2755 {
2756    Builder bld(ctx->program, ctx->block);
2757    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2758    if (ctx->stage & sw_vs) {
2759
2760       nir_instr *off_instr = instr->src[0].ssa->parent_instr;
2761       if (off_instr->type != nir_instr_type_load_const) {
2762          fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2763          nir_print_instr(off_instr, stderr);
2764          fprintf(stderr, "\n");
2765       }
2766       uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
2767
2768       Temp vertex_buffers = convert_pointer_to_64_bit(ctx, ctx->vertex_buffers);
2769
2770       unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
2771       unsigned component = nir_intrinsic_component(instr);
2772       unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
2773       uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
2774       uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
2775       unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location];
2776
2777       unsigned dfmt = attrib_format & 0xf;
2778
2779       unsigned nfmt = (attrib_format >> 4) & 0x7;
2780       unsigned num_dfmt_channels = get_num_channels_from_data_format(dfmt);
2781       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component;
2782       unsigned num_channels = MIN2(util_last_bit(mask), num_dfmt_channels);
2783       unsigned alpha_adjust = (ctx->options->key.vs.alpha_adjust >> (location * 2)) & 3;
2784       bool post_shuffle = ctx->options->key.vs.post_shuffle & (1 << location);
2785       if (post_shuffle)
2786          num_channels = MAX2(num_channels, 3);
2787
2788       Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, Operand(attrib_binding * 16u));
2789
2790       Temp index;
2791       if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) {
2792          uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location];
2793          if (divisor) {
2794             ctx->needs_instance_id = true;
2795
2796             if (divisor != 1) {
2797                Temp divided = bld.tmp(v1);
2798                emit_v_div_u32(ctx, divided, as_vgpr(ctx, ctx->instance_id), divisor);
2799                index = bld.vadd32(bld.def(v1), ctx->start_instance, divided);
2800             } else {
2801                index = bld.vadd32(bld.def(v1), ctx->start_instance, ctx->instance_id);
2802             }
2803          } else {
2804             index = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), ctx->start_instance);
2805          }
2806       } else {
2807          index = bld.vadd32(bld.def(v1), ctx->base_vertex, ctx->vertex_id);
2808       }
2809
2810       if (attrib_stride != 0 && attrib_offset > attrib_stride) {
2811          index = bld.vadd32(bld.def(v1), Operand(attrib_offset / attrib_stride), index);
2812          attrib_offset = attrib_offset % attrib_stride;
2813       }
2814
2815       Operand soffset(0u);
2816       if (attrib_offset >= 4096) {
2817          soffset = bld.copy(bld.def(s1), Operand(attrib_offset));
2818          attrib_offset = 0;
2819       }
2820
2821       aco_opcode opcode;
2822       switch (num_channels) {
2823       case 1:
2824          opcode = aco_opcode::tbuffer_load_format_x;
2825          break;
2826       case 2:
2827          opcode = aco_opcode::tbuffer_load_format_xy;
2828          break;
2829       case 3:
2830          opcode = aco_opcode::tbuffer_load_format_xyz;
2831          break;
2832       case 4:
2833          opcode = aco_opcode::tbuffer_load_format_xyzw;
2834          break;
2835       default:
2836          unreachable("Unimplemented load_input vector size");
2837       }
2838
2839       Temp tmp = post_shuffle || num_channels != dst.size() || alpha_adjust != RADV_ALPHA_ADJUST_NONE || component ? bld.tmp(RegType::vgpr, num_channels) : dst;
2840
2841       aco_ptr<MTBUF_instruction> mubuf{create_instruction<MTBUF_instruction>(opcode, Format::MTBUF, 3, 1)};
2842       mubuf->operands[0] = Operand(index);
2843       mubuf->operands[1] = Operand(list);
2844       mubuf->operands[2] = soffset;
2845       mubuf->definitions[0] = Definition(tmp);
2846       mubuf->idxen = true;
2847       mubuf->can_reorder = true;
2848       mubuf->dfmt = dfmt;
2849       mubuf->nfmt = nfmt;
2850       assert(attrib_offset < 4096);
2851       mubuf->offset = attrib_offset;
2852       ctx->block->instructions.emplace_back(std::move(mubuf));
2853
2854       emit_split_vector(ctx, tmp, tmp.size());
2855
2856       if (tmp.id() != dst.id()) {
2857          bool is_float = nfmt != V_008F0C_BUF_NUM_FORMAT_UINT &&
2858                          nfmt != V_008F0C_BUF_NUM_FORMAT_SINT;
2859
2860          static const unsigned swizzle_normal[4] = {0, 1, 2, 3};
2861          static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3};
2862          const unsigned *swizzle = post_shuffle ? swizzle_post_shuffle : swizzle_normal;
2863
2864          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2865          for (unsigned i = 0; i < dst.size(); i++) {
2866             unsigned idx = i + component;
2867             if (idx == 3 && alpha_adjust != RADV_ALPHA_ADJUST_NONE && num_channels >= 4) {
2868                Temp alpha = emit_extract_vector(ctx, tmp, swizzle[3], v1);
2869                vec->operands[3] = Operand(adjust_vertex_fetch_alpha(ctx, alpha_adjust, alpha));
2870             } else if (idx < num_channels) {
2871                vec->operands[i] = Operand(emit_extract_vector(ctx, tmp, swizzle[idx], v1));
2872             } else if (is_float && idx == 3) {
2873                vec->operands[i] = Operand(0x3f800000u);
2874             } else if (!is_float && idx == 3) {
2875                vec->operands[i] = Operand(1u);
2876             } else {
2877                vec->operands[i] = Operand(0u);
2878             }
2879          }
2880          vec->definitions[0] = Definition(dst);
2881          ctx->block->instructions.emplace_back(std::move(vec));
2882          emit_split_vector(ctx, dst, dst.size());
2883       }
2884
2885    } else if (ctx->stage == fragment_fs) {
2886       nir_instr *off_instr = instr->src[0].ssa->parent_instr;
2887       if (off_instr->type != nir_instr_type_load_const ||
2888           nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
2889          fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
2890          nir_print_instr(off_instr, stderr);
2891          fprintf(stderr, "\n");
2892       }
2893
2894       Temp prim_mask = ctx->prim_mask;
2895       nir_const_value* offset = nir_src_as_const_value(instr->src[0]);
2896       if (offset) {
2897          assert(offset->u32 == 0);
2898       } else {
2899          /* the lower 15bit of the prim_mask contain the offset into LDS
2900           * while the upper bits contain the number of prims */
2901          Temp offset_src = get_ssa_temp(ctx, instr->src[0].ssa);
2902          assert(offset_src.regClass() == s1 && "TODO: divergent offsets...");
2903          Builder bld(ctx->program, ctx->block);
2904          Temp stride = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), prim_mask, Operand(16u));
2905          stride = bld.sop1(aco_opcode::s_bcnt1_i32_b32, bld.def(s1), bld.def(s1, scc), stride);
2906          stride = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, Operand(48u));
2907          offset_src = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), stride, offset_src);
2908          prim_mask = bld.sop2(aco_opcode::s_add_i32, bld.def(s1, m0), bld.def(s1, scc), offset_src, prim_mask);
2909       }
2910
2911       unsigned idx = nir_intrinsic_base(instr);
2912       unsigned component = nir_intrinsic_component(instr);
2913
2914       if (dst.size() == 1) {
2915          bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand(2u), bld.m0(prim_mask), idx, component);
2916       } else {
2917          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
2918          for (unsigned i = 0; i < dst.size(); i++)
2919             vec->operands[i] = bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand(2u), bld.m0(prim_mask), idx, component + i);
2920          vec->definitions[0] = Definition(dst);
2921          bld.insert(std::move(vec));
2922       }
2923
2924    } else {
2925       unreachable("Shader stage not implemented");
2926    }
2927 }
2928
2929 Temp load_desc_ptr(isel_context *ctx, unsigned desc_set)
2930 {
2931    if (ctx->program->info->need_indirect_descriptor_sets) {
2932       Builder bld(ctx->program, ctx->block);
2933       Temp ptr64 = convert_pointer_to_64_bit(ctx, ctx->descriptor_sets[0]);
2934       return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, Operand(desc_set << 2));//, false, false, false);
2935    }
2936
2937    return ctx->descriptor_sets[desc_set];
2938 }
2939
2940
2941 void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
2942 {
2943    Builder bld(ctx->program, ctx->block);
2944    Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
2945    unsigned desc_set = nir_intrinsic_desc_set(instr);
2946    unsigned binding = nir_intrinsic_binding(instr);
2947
2948    Temp desc_ptr;
2949    radv_pipeline_layout *pipeline_layout = ctx->options->layout;
2950    radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
2951    unsigned offset = layout->binding[binding].offset;
2952    unsigned stride;
2953    if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
2954        layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
2955       unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
2956       desc_ptr = ctx->push_constants;
2957       offset = pipeline_layout->push_constant_size + 16 * idx;
2958       stride = 16;
2959    } else {
2960       desc_ptr = load_desc_ptr(ctx, desc_set);
2961       stride = layout->binding[binding].size;
2962    }
2963
2964    nir_const_value* nir_const_index = nir_src_as_const_value(instr->src[0]);
2965    unsigned const_index = nir_const_index ? nir_const_index->u32 : 0;
2966    if (stride != 1) {
2967       if (nir_const_index) {
2968          const_index = const_index * stride;
2969       } else {
2970          index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), Operand(index));
2971       }
2972    }
2973    if (offset) {
2974       if (nir_const_index) {
2975          const_index = const_index + offset;
2976       } else {
2977          index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), Operand(index));
2978       }
2979    }
2980
2981    if (nir_const_index && const_index == 0) {
2982       index = desc_ptr;
2983    } else {
2984       index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
2985                        nir_const_index ? Operand(const_index) : Operand(index),
2986                        Operand(desc_ptr));
2987    }
2988
2989    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
2990    bld.sop1(aco_opcode::s_mov_b32, Definition(dst), index);
2991 }
2992
2993 void load_buffer(isel_context *ctx, unsigned num_components, Temp dst, Temp rsrc, Temp offset, bool glc=false)
2994 {
2995    Builder bld(ctx->program, ctx->block);
2996
2997    unsigned num_bytes = dst.size() * 4;
2998
2999    aco_opcode op;
3000    if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
3001       if (ctx->options->chip_class < GFX8)
3002          offset = as_vgpr(ctx, offset);
3003
3004       Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
3005       Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
3006       unsigned const_offset = 0;
3007
3008       Temp lower = Temp();
3009       if (num_bytes > 16) {
3010          assert(num_components == 3 || num_components == 4);
3011          op = aco_opcode::buffer_load_dwordx4;
3012          lower = bld.tmp(v4);
3013          aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3014          mubuf->definitions[0] = Definition(lower);
3015          mubuf->operands[0] = vaddr;
3016          mubuf->operands[1] = Operand(rsrc);
3017          mubuf->operands[2] = soffset;
3018          mubuf->offen = (offset.type() == RegType::vgpr);
3019          mubuf->glc = glc;
3020          mubuf->barrier = barrier_buffer;
3021          bld.insert(std::move(mubuf));
3022          emit_split_vector(ctx, lower, 2);
3023          num_bytes -= 16;
3024          const_offset = 16;
3025       }
3026
3027       switch (num_bytes) {
3028          case 4:
3029             op = aco_opcode::buffer_load_dword;
3030             break;
3031          case 8:
3032             op = aco_opcode::buffer_load_dwordx2;
3033             break;
3034          case 12:
3035             op = aco_opcode::buffer_load_dwordx3;
3036             break;
3037          case 16:
3038             op = aco_opcode::buffer_load_dwordx4;
3039             break;
3040          default:
3041             unreachable("Load SSBO not implemented for this size.");
3042       }
3043       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
3044       mubuf->operands[0] = vaddr;
3045       mubuf->operands[1] = Operand(rsrc);
3046       mubuf->operands[2] = soffset;
3047       mubuf->offen = (offset.type() == RegType::vgpr);
3048       mubuf->glc = glc;
3049       mubuf->barrier = barrier_buffer;
3050       mubuf->offset = const_offset;
3051       aco_ptr<Instruction> instr = std::move(mubuf);
3052
3053       if (dst.size() > 4) {
3054          assert(lower != Temp());
3055          Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size());
3056          instr->definitions[0] = Definition(upper);
3057          bld.insert(std::move(instr));
3058          if (dst.size() == 8)
3059             emit_split_vector(ctx, upper, 2);
3060          instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1));
3061          instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2));
3062          instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2));
3063          instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2));
3064          if (dst.size() == 8)
3065             instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2));
3066       }
3067
3068       if (dst.type() == RegType::sgpr) {
3069          Temp vec = bld.tmp(RegType::vgpr, dst.size());
3070          instr->definitions[0] = Definition(vec);
3071          bld.insert(std::move(instr));
3072          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
3073       } else {
3074          instr->definitions[0] = Definition(dst);
3075          bld.insert(std::move(instr));
3076       }
3077    } else {
3078       switch (num_bytes) {
3079          case 4:
3080             op = aco_opcode::s_buffer_load_dword;
3081             break;
3082          case 8:
3083             op = aco_opcode::s_buffer_load_dwordx2;
3084             break;
3085          case 12:
3086          case 16:
3087             op = aco_opcode::s_buffer_load_dwordx4;
3088             break;
3089          case 24:
3090          case 32:
3091             op = aco_opcode::s_buffer_load_dwordx8;
3092             break;
3093          default:
3094             unreachable("Load SSBO not implemented for this size.");
3095       }
3096       aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
3097       load->operands[0] = Operand(rsrc);
3098       load->operands[1] = Operand(bld.as_uniform(offset));
3099       assert(load->operands[1].getTemp().type() == RegType::sgpr);
3100       load->definitions[0] = Definition(dst);
3101       load->glc = glc;
3102       load->barrier = barrier_buffer;
3103       assert(ctx->options->chip_class >= GFX8 || !glc);
3104
3105       /* trim vector */
3106       if (dst.size() == 3) {
3107          Temp vec = bld.tmp(s4);
3108          load->definitions[0] = Definition(vec);
3109          bld.insert(std::move(load));
3110          emit_split_vector(ctx, vec, 4);
3111
3112          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3113                     emit_extract_vector(ctx, vec, 0, s1),
3114                     emit_extract_vector(ctx, vec, 1, s1),
3115                     emit_extract_vector(ctx, vec, 2, s1));
3116       } else if (dst.size() == 6) {
3117          Temp vec = bld.tmp(s8);
3118          load->definitions[0] = Definition(vec);
3119          bld.insert(std::move(load));
3120          emit_split_vector(ctx, vec, 4);
3121
3122          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3123                     emit_extract_vector(ctx, vec, 0, s2),
3124                     emit_extract_vector(ctx, vec, 1, s2),
3125                     emit_extract_vector(ctx, vec, 2, s2));
3126       } else {
3127          bld.insert(std::move(load));
3128       }
3129
3130    }
3131    emit_split_vector(ctx, dst, num_components);
3132 }
3133
3134 void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
3135 {
3136    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3137    Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa);
3138
3139    Builder bld(ctx->program, ctx->block);
3140
3141    nir_intrinsic_instr* idx_instr = nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3142    unsigned desc_set = nir_intrinsic_desc_set(idx_instr);
3143    unsigned binding = nir_intrinsic_binding(idx_instr);
3144    radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
3145
3146    if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT) {
3147       uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3148                            S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3149                            S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3150                            S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
3151                            S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3152                            S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3153       Temp upper_dwords = bld.pseudo(aco_opcode::p_create_vector, bld.def(s3),
3154                                      Operand(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)),
3155                                      Operand(0xFFFFFFFFu),
3156                                      Operand(desc_type));
3157       rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3158                         rsrc, upper_dwords);
3159    } else {
3160       rsrc = convert_pointer_to_64_bit(ctx, rsrc);
3161       rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
3162    }
3163
3164    load_buffer(ctx, instr->num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa));
3165 }
3166
3167 void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3168 {
3169    Builder bld(ctx->program, ctx->block);
3170    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3171
3172    unsigned offset = nir_intrinsic_base(instr);
3173    nir_const_value *index_cv = nir_src_as_const_value(instr->src[0]);
3174    if (index_cv && instr->dest.ssa.bit_size == 32) {
3175
3176       unsigned count = instr->dest.ssa.num_components;
3177       unsigned start = (offset + index_cv->u32) / 4u;
3178       start -= ctx->base_inline_push_consts;
3179       if (start + count <= ctx->num_inline_push_consts) {
3180          std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
3181          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
3182          for (unsigned i = 0; i < count; ++i) {
3183             elems[i] = ctx->inline_push_consts[start + i];
3184             vec->operands[i] = Operand{elems[i]};
3185          }
3186          vec->definitions[0] = Definition(dst);
3187          ctx->block->instructions.emplace_back(std::move(vec));
3188          ctx->allocated_vec.emplace(dst.id(), elems);
3189          return;
3190       }
3191    }
3192
3193    Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
3194    if (offset != 0) // TODO check if index != 0 as well
3195       index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
3196    Temp ptr = convert_pointer_to_64_bit(ctx, ctx->push_constants);
3197    Temp vec = dst;
3198    bool trim = false;
3199    aco_opcode op;
3200
3201    switch (dst.size()) {
3202    case 1:
3203       op = aco_opcode::s_load_dword;
3204       break;
3205    case 2:
3206       op = aco_opcode::s_load_dwordx2;
3207       break;
3208    case 3:
3209       vec = bld.tmp(s4);
3210       trim = true;
3211    case 4:
3212       op = aco_opcode::s_load_dwordx4;
3213       break;
3214    case 6:
3215       vec = bld.tmp(s8);
3216       trim = true;
3217    case 8:
3218       op = aco_opcode::s_load_dwordx8;
3219       break;
3220    default:
3221       unreachable("unimplemented or forbidden load_push_constant.");
3222    }
3223
3224    bld.smem(op, Definition(vec), ptr, index);
3225
3226    if (trim) {
3227       emit_split_vector(ctx, vec, 4);
3228       RegClass rc = dst.size() == 3 ? s1 : s2;
3229       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
3230                  emit_extract_vector(ctx, vec, 0, rc),
3231                  emit_extract_vector(ctx, vec, 1, rc),
3232                  emit_extract_vector(ctx, vec, 2, rc));
3233
3234    }
3235    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
3236 }
3237
3238 void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
3239 {
3240    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3241
3242    Builder bld(ctx->program, ctx->block);
3243
3244    uint32_t desc_type = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
3245                         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
3246                         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
3247                         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W);
3248    if (ctx->options->chip_class >= GFX10) {
3249       desc_type |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
3250                    S_008F0C_OOB_SELECT(3) |
3251                    S_008F0C_RESOURCE_LEVEL(1);
3252    } else {
3253       desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
3254                    S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
3255    }
3256
3257    unsigned base = nir_intrinsic_base(instr) + ctx->constant_data_offset;
3258    unsigned range = nir_intrinsic_range(instr);
3259
3260    Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
3261    if (base && offset.type() == RegType::sgpr)
3262       offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
3263    else if (base && offset.type() == RegType::vgpr)
3264       offset = bld.vadd32(bld.def(v1), Operand(base), offset);
3265
3266    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3267                           bld.sop1(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), Operand(0u)),
3268                           Operand(MIN2(range, ctx->shader->constant_data_size - nir_intrinsic_base(instr))),
3269                           Operand(desc_type));
3270
3271    load_buffer(ctx, instr->num_components, dst, rsrc, offset);
3272 }
3273
3274 void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
3275 {
3276    if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3277       ctx->cf_info.exec_potentially_empty = true;
3278
3279    ctx->program->needs_exact = true;
3280
3281    Builder bld(ctx->program, ctx->block);
3282    Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
3283    src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
3284    bld.pseudo(aco_opcode::p_discard_if, src);
3285    ctx->block->kind |= block_kind_uses_discard_if;
3286    return;
3287 }
3288
3289 void visit_discard(isel_context* ctx, nir_intrinsic_instr *instr)
3290 {
3291    Builder bld(ctx->program, ctx->block);
3292
3293    if (ctx->cf_info.loop_nest_depth || ctx->cf_info.parent_if.is_divergent)
3294       ctx->cf_info.exec_potentially_empty = true;
3295
3296    bool divergent = ctx->cf_info.parent_if.is_divergent ||
3297                     ctx->cf_info.parent_loop.has_divergent_continue;
3298
3299    if (ctx->block->loop_nest_depth &&
3300        ((nir_instr_is_last(&instr->instr) && !divergent) || divergent)) {
3301       /* we handle discards the same way as jump instructions */
3302       append_logical_end(ctx->block);
3303
3304       /* in loops, discard behaves like break */
3305       Block *linear_target = ctx->cf_info.parent_loop.exit;
3306       ctx->block->kind |= block_kind_discard;
3307
3308       if (!divergent) {
3309          /* uniform discard - loop ends here */
3310          assert(nir_instr_is_last(&instr->instr));
3311          ctx->block->kind |= block_kind_uniform;
3312          ctx->cf_info.has_branch = true;
3313          bld.branch(aco_opcode::p_branch);
3314          add_linear_edge(ctx->block->index, linear_target);
3315          return;
3316       }
3317
3318       /* we add a break right behind the discard() instructions */
3319       ctx->block->kind |= block_kind_break;
3320       unsigned idx = ctx->block->index;
3321
3322       /* remove critical edges from linear CFG */
3323       bld.branch(aco_opcode::p_branch);
3324       Block* break_block = ctx->program->create_and_insert_block();
3325       break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3326       break_block->kind |= block_kind_uniform;
3327       add_linear_edge(idx, break_block);
3328       add_linear_edge(break_block->index, linear_target);
3329       bld.reset(break_block);
3330       bld.branch(aco_opcode::p_branch);
3331
3332       Block* continue_block = ctx->program->create_and_insert_block();
3333       continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
3334       add_linear_edge(idx, continue_block);
3335       append_logical_start(continue_block);
3336       ctx->block = continue_block;
3337
3338       return;
3339    }
3340
3341    /* it can currently happen that NIR doesn't remove the unreachable code */
3342    if (!nir_instr_is_last(&instr->instr)) {
3343       ctx->program->needs_exact = true;
3344       /* save exec somewhere temporarily so that it doesn't get
3345        * overwritten before the discard from outer exec masks */
3346       Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), Operand(0xFFFFFFFF), Operand(exec, s2));
3347       bld.pseudo(aco_opcode::p_discard_if, cond);
3348       ctx->block->kind |= block_kind_uses_discard_if;
3349       return;
3350    }
3351
3352    /* This condition is incorrect for uniformly branched discards in a loop
3353     * predicated by a divergent condition, but the above code catches that case
3354     * and the discard would end up turning into a discard_if.
3355     * For example:
3356     * if (divergent) {
3357     *    while (...) {
3358     *       if (uniform) {
3359     *          discard;
3360     *       }
3361     *    }
3362     * }
3363     */
3364    if (!ctx->cf_info.parent_if.is_divergent) {
3365       /* program just ends here */
3366       ctx->block->kind |= block_kind_uniform;
3367       bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1),
3368               0 /* enabled mask */, 9 /* dest */,
3369               false /* compressed */, true/* done */, true /* valid mask */);
3370       bld.sopp(aco_opcode::s_endpgm);
3371       // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis
3372    } else {
3373       ctx->block->kind |= block_kind_discard;
3374       /* branch and linear edge is added by visit_if() */
3375    }
3376 }
3377
/* Kind of descriptor requested from get_sampler_desc(). */
enum aco_descriptor_type {
   ACO_DESC_IMAGE = 0,
   ACO_DESC_FMASK = 1,
   ACO_DESC_SAMPLER = 2,
   ACO_DESC_BUFFER = 3,
   ACO_DESC_PLANE_0 = 4,
   ACO_DESC_PLANE_1 = 5,
   ACO_DESC_PLANE_2 = 6,
};
3387
/* Image dimensionality as returned by get_sampler_dim(). */
enum aco_image_dim {
   aco_image_1d = 0,
   aco_image_2d = 1,
   aco_image_3d = 2,
   aco_image_cube = 3, // includes cube arrays
   aco_image_1darray = 4,
   aco_image_2darray = 5,
   aco_image_2dmsaa = 6,
   aco_image_2darraymsaa = 7,
};
3398
3399 static enum aco_image_dim
3400 get_sampler_dim(isel_context *ctx, enum glsl_sampler_dim dim, bool is_array)
3401 {
3402    switch (dim) {
3403    case GLSL_SAMPLER_DIM_1D:
3404       if (ctx->options->chip_class >= GFX9)
3405          return is_array ? aco_image_2darray : aco_image_2d;
3406       return is_array ? aco_image_1darray : aco_image_1d;
3407    case GLSL_SAMPLER_DIM_2D:
3408    case GLSL_SAMPLER_DIM_RECT:
3409    case GLSL_SAMPLER_DIM_EXTERNAL:
3410       return is_array ? aco_image_2darray : aco_image_2d;
3411    case GLSL_SAMPLER_DIM_3D:
3412       return aco_image_3d;
3413    case GLSL_SAMPLER_DIM_CUBE:
3414       return aco_image_cube;
3415    case GLSL_SAMPLER_DIM_MS:
3416       return is_array ? aco_image_2darraymsaa : aco_image_2dmsaa;
3417    case GLSL_SAMPLER_DIM_SUBPASS:
3418       return aco_image_2darray;
3419    case GLSL_SAMPLER_DIM_SUBPASS_MS:
3420       return aco_image_2darraymsaa;
3421    default:
3422       unreachable("bad sampler dim");
3423    }
3424 }
3425
3426 static bool
3427 should_declare_array(isel_context *ctx, enum glsl_sampler_dim sampler_dim, bool is_array) {
3428    if (sampler_dim == GLSL_SAMPLER_DIM_BUF)
3429       return false;
3430    aco_image_dim dim = get_sampler_dim(ctx, sampler_dim, is_array);
3431    return dim == aco_image_cube ||
3432           dim == aco_image_1darray ||
3433           dim == aco_image_2darray ||
3434           dim == aco_image_2darraymsaa;
3435 }
3436
3437 Temp get_sampler_desc(isel_context *ctx, nir_deref_instr *deref_instr,
3438                       enum aco_descriptor_type desc_type,
3439                       const nir_tex_instr *tex_instr, bool image, bool write)
3440 {
3441 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc
3442    std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 32 | deref_instr->dest.ssa.index);
3443    if (it != ctx->tex_desc.end())
3444       return it->second;
3445 */
3446    Temp index = Temp();
3447    bool index_set = false;
3448    unsigned constant_index = 0;
3449    unsigned descriptor_set;
3450    unsigned base_index;
3451    Builder bld(ctx->program, ctx->block);
3452
3453    if (!deref_instr) {
3454       assert(tex_instr && !image);
3455       descriptor_set = 0;
3456       base_index = tex_instr->sampler_index;
3457    } else {
3458       while(deref_instr->deref_type != nir_deref_type_var) {
3459          unsigned array_size = glsl_get_aoa_size(deref_instr->type);
3460          if (!array_size)
3461             array_size = 1;
3462
3463          assert(deref_instr->deref_type == nir_deref_type_array);
3464          nir_const_value *const_value = nir_src_as_const_value(deref_instr->arr.index);
3465          if (const_value) {
3466             constant_index += array_size * const_value->u32;
3467          } else {
3468             Temp indirect = bld.as_uniform(get_ssa_temp(ctx, deref_instr->arr.index.ssa));
3469
3470             if (array_size != 1)
3471                indirect = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(array_size), indirect);
3472
3473             if (!index_set) {
3474                index = indirect;
3475                index_set = true;
3476             } else {
3477                index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect);
3478             }
3479          }
3480
3481          deref_instr = nir_src_as_deref(deref_instr->parent);
3482       }
3483       descriptor_set = deref_instr->var->data.descriptor_set;
3484       base_index = deref_instr->var->data.binding;
3485    }
3486
3487    Temp list = load_desc_ptr(ctx, descriptor_set);
3488    list = convert_pointer_to_64_bit(ctx, list);
3489
3490    struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
3491    struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
3492    unsigned offset = binding->offset;
3493    unsigned stride = binding->size;
3494    aco_opcode opcode;
3495    RegClass type;
3496
3497    assert(base_index < layout->binding_count);
3498
3499    switch (desc_type) {
3500    case ACO_DESC_IMAGE:
3501       type = s8;
3502       opcode = aco_opcode::s_load_dwordx8;
3503       break;
3504    case ACO_DESC_FMASK:
3505       type = s8;
3506       opcode = aco_opcode::s_load_dwordx8;
3507       offset += 32;
3508       break;
3509    case ACO_DESC_SAMPLER:
3510       type = s4;
3511       opcode = aco_opcode::s_load_dwordx4;
3512       if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
3513          offset += radv_combined_image_descriptor_sampler_offset(binding);
3514       break;
3515    case ACO_DESC_BUFFER:
3516       type = s4;
3517       opcode = aco_opcode::s_load_dwordx4;
3518       break;
3519    case ACO_DESC_PLANE_0:
3520    case ACO_DESC_PLANE_1:
3521       type = s8;
3522       opcode = aco_opcode::s_load_dwordx8;
3523       offset += 32 * (desc_type - ACO_DESC_PLANE_0);
3524       break;
3525    case ACO_DESC_PLANE_2:
3526       type = s4;
3527       opcode = aco_opcode::s_load_dwordx4;
3528       offset += 64;
3529       break;
3530    default:
3531       unreachable("invalid desc_type\n");
3532    }
3533
3534    offset += constant_index * stride;
3535
3536    if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset &&
3537       (!index_set || binding->immutable_samplers_equal)) {
3538       if (binding->immutable_samplers_equal)
3539          constant_index = 0;
3540
3541       const uint32_t *samplers = radv_immutable_samplers(layout, binding);
3542       return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4),
3543                         Operand(samplers[constant_index * 4 + 0]),
3544                         Operand(samplers[constant_index * 4 + 1]),
3545                         Operand(samplers[constant_index * 4 + 2]),
3546                         Operand(samplers[constant_index * 4 + 3]));
3547    }
3548
3549    Operand off;
3550    if (!index_set) {
3551       off = Operand(offset);
3552    } else {
3553       off = Operand((Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset),
3554                                    bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(stride), index)));
3555    }
3556
3557    Temp res = bld.smem(opcode, bld.def(type), list, off);
3558
3559    if (desc_type == ACO_DESC_PLANE_2) {
3560       Temp components[8];
3561       for (unsigned i = 0; i < 8; i++)
3562          components[i] = bld.tmp(s1);
3563       bld.pseudo(aco_opcode::p_split_vector,
3564                  Definition(components[0]),
3565                  Definition(components[1]),
3566                  Definition(components[2]),
3567                  Definition(components[3]),
3568                  res);
3569
3570       Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, image, write);
3571       bld.pseudo(aco_opcode::p_split_vector,
3572                  bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1),
3573                  Definition(components[4]),
3574                  Definition(components[5]),
3575                  Definition(components[6]),
3576                  Definition(components[7]),
3577                  desc2);
3578
3579       res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8),
3580                        components[0], components[1], components[2], components[3],
3581                        components[4], components[5], components[6], components[7]);
3582    }
3583
3584    return res;
3585 }
3586
3587 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
3588 {
3589    switch (dim) {
3590    case GLSL_SAMPLER_DIM_BUF:
3591       return 1;
3592    case GLSL_SAMPLER_DIM_1D:
3593       return array ? 2 : 1;
3594    case GLSL_SAMPLER_DIM_2D:
3595       return array ? 3 : 2;
3596    case GLSL_SAMPLER_DIM_MS:
3597       return array ? 4 : 3;
3598    case GLSL_SAMPLER_DIM_3D:
3599    case GLSL_SAMPLER_DIM_CUBE:
3600       return 3;
3601    case GLSL_SAMPLER_DIM_RECT:
3602    case GLSL_SAMPLER_DIM_SUBPASS:
3603       return 2;
3604    case GLSL_SAMPLER_DIM_SUBPASS_MS:
3605       return 3;
3606    default:
3607       break;
3608    }
3609    return 0;
3610 }
3611
3612
3613 /* Adjust the sample index according to FMASK.
3614  *
3615  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3616  * which is the identity mapping. Each nibble says which physical sample
3617  * should be fetched to get that sample.
3618  *
3619  * For example, 0x11111100 means there are only 2 samples stored and
3620  * the second sample covers 3/4 of the pixel. When reading samples 0
3621  * and 1, return physical sample 0 (determined by the first two 0s
3622  * in FMASK), otherwise return physical sample 1.
3623  *
3624  * The sample index should be adjusted as follows:
3625  *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
3626  */
3627 static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, Temp coords, Operand sample_index, Temp fmask_desc_ptr)
3628 {
3629    Builder bld(ctx->program, ctx->block);
3630    Temp fmask = bld.tmp(v1);
3631
3632    aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
3633    load->operands[0] = Operand(coords);
3634    load->operands[1] = Operand(fmask_desc_ptr);
3635    load->definitions[0] = Definition(fmask);
3636    load->glc = false;
3637    load->dmask = 0x1;
3638    load->unrm = true;
3639    load->da = da;
3640    load->can_reorder = true; /* fmask images shouldn't be modified */
3641    ctx->block->instructions.emplace_back(std::move(load));
3642
3643    Operand sample_index4;
3644    if (sample_index.isConstant() && sample_index.constantValue() < 16) {
3645       sample_index4 = Operand(sample_index.constantValue() << 2);
3646    } else if (sample_index.regClass() == s1) {
3647       sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
3648    } else {
3649       assert(sample_index.regClass() == v1);
3650       sample_index4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), sample_index);
3651    }
3652
3653    Temp final_sample;
3654    if (sample_index4.isConstant() && sample_index4.constantValue() == 0)
3655       final_sample = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(15u), fmask);
3656    else if (sample_index4.isConstant() && sample_index4.constantValue() == 28)
3657       final_sample = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(28u), fmask);
3658    else
3659       final_sample = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), fmask, sample_index4, Operand(4u));
3660
3661    /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3662     * resource descriptor is 0 (invalid),
3663     */
3664    Temp compare = bld.tmp(s2);
3665    bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(compare),
3666                 Operand(0u), emit_extract_vector(ctx, fmask_desc_ptr, 1, s1)).def(0).setHint(vcc);
3667
3668    Temp sample_index_v = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), sample_index);
3669
3670    /* Replace the MSAA sample index. */
3671    return bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), sample_index_v, final_sample, compare);
3672 }
3673
3674 static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr, const struct glsl_type *type)
3675 {
3676
3677    Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa);
3678    enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3679    bool is_array = glsl_sampler_type_is_array(type);
3680    ASSERTED bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3681    assert(!add_frag_pos && "Input attachments should be lowered.");
3682    bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3683    bool gfx9_1d = ctx->options->chip_class >= GFX9 && dim == GLSL_SAMPLER_DIM_1D;
3684    int count = image_type_to_components_count(dim, is_array);
3685    std::vector<Operand> coords(count);
3686
3687    if (is_ms) {
3688       Operand sample_index;
3689       nir_const_value *sample_cv = nir_src_as_const_value(instr->src[2]);
3690       if (sample_cv)
3691          sample_index = Operand(sample_cv->u32);
3692       else
3693          sample_index = Operand(emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1));
3694
3695       if (instr->intrinsic == nir_intrinsic_image_deref_load) {
3696          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, is_array ? 3 : 2, 1)};
3697          for (unsigned i = 0; i < vec->operands.size(); i++)
3698             vec->operands[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3699          Temp fmask_load_address = {ctx->program->allocateId(), is_array ? v3 : v2};
3700          vec->definitions[0] = Definition(fmask_load_address);
3701          ctx->block->instructions.emplace_back(std::move(vec));
3702
3703          Temp fmask_desc_ptr = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_FMASK, nullptr, false, false);
3704          sample_index = Operand(adjust_sample_index_using_fmask(ctx, is_array, fmask_load_address, sample_index, fmask_desc_ptr));
3705       }
3706       count--;
3707       coords[count] = sample_index;
3708    }
3709
3710    if (count == 1 && !gfx9_1d)
3711       return emit_extract_vector(ctx, src0, 0, v1);
3712
3713    if (gfx9_1d) {
3714       coords[0] = Operand(emit_extract_vector(ctx, src0, 0, v1));
3715       coords.resize(coords.size() + 1);
3716       coords[1] = Operand((uint32_t) 0);
3717       if (is_array)
3718          coords[2] = Operand(emit_extract_vector(ctx, src0, 1, v1));
3719    } else {
3720       for (int i = 0; i < count; i++)
3721          coords[i] = Operand(emit_extract_vector(ctx, src0, i, v1));
3722    }
3723
3724    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
3725    for (unsigned i = 0; i < coords.size(); i++)
3726       vec->operands[i] = coords[i];
3727    Temp res = {ctx->program->allocateId(), RegClass(RegType::vgpr, coords.size())};
3728    vec->definitions[0] = Definition(res);
3729    ctx->block->instructions.emplace_back(std::move(vec));
3730    return res;
3731 }
3732
3733
3734 void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
3735 {
3736    Builder bld(ctx->program, ctx->block);
3737    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
3738    const struct glsl_type *type = glsl_without_array(var->type);
3739    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3740    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3741
3742    if (dim == GLSL_SAMPLER_DIM_BUF) {
3743       unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
3744       unsigned num_channels = util_last_bit(mask);
3745       Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
3746       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
3747
3748       aco_opcode opcode;
3749       switch (num_channels) {
3750       case 1:
3751          opcode = aco_opcode::buffer_load_format_x;
3752          break;
3753       case 2:
3754          opcode = aco_opcode::buffer_load_format_xy;
3755          break;
3756       case 3:
3757          opcode = aco_opcode::buffer_load_format_xyz;
3758          break;
3759       case 4:
3760          opcode = aco_opcode::buffer_load_format_xyzw;
3761          break;
3762       default:
3763          unreachable(">4 channel buffer image load");
3764       }
3765       aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3, 1)};
3766       load->operands[0] = Operand(vindex);
3767       load->operands[1] = Operand(rsrc);
3768       load->operands[2] = Operand((uint32_t) 0);
3769       Temp tmp;
3770       if (num_channels == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
3771          tmp = dst;
3772       else
3773          tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
3774       load->definitions[0] = Definition(tmp);
3775       load->idxen = true;
3776       load->barrier = barrier_image;
3777       ctx->block->instructions.emplace_back(std::move(load));
3778
3779       expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
3780       return;
3781    }
3782
3783    Temp coords = get_image_coords(ctx, instr, type);
3784    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
3785    //aco_image_dim img_dim = get_image_dim(ctx, glsl_get_sampler_dim(type), glsl_sampler_type_is_array(type));
3786
3787    unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
3788    unsigned num_components = util_bitcount(dmask);
3789    Temp tmp;
3790    if (num_components == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
3791       tmp = dst;
3792    else
3793       tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_components)};
3794
3795    aco_ptr<MIMG_instruction> load{create_instruction<MIMG_instruction>(aco_opcode::image_load, Format::MIMG, 2, 1)};
3796    load->operands[0] = Operand(coords);
3797    load->operands[1] = Operand(resource);
3798    load->definitions[0] = Definition(tmp);
3799    load->glc = var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
3800    load->dmask = dmask;
3801    load->unrm = true;
3802    load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
3803    load->barrier = barrier_image;
3804    ctx->block->instructions.emplace_back(std::move(load));
3805
3806    expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
3807    return;
3808 }
3809
3810 void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
3811 {
3812    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
3813    const struct glsl_type *type = glsl_without_array(var->type);
3814    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3815    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
3816
3817    bool glc = ctx->options->chip_class == GFX6 || var->data.image.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
3818
3819    if (dim == GLSL_SAMPLER_DIM_BUF) {
3820       Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
3821       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
3822       aco_opcode opcode;
3823       switch (data.size()) {
3824       case 1:
3825          opcode = aco_opcode::buffer_store_format_x;
3826          break;
3827       case 2:
3828          opcode = aco_opcode::buffer_store_format_xy;
3829          break;
3830       case 3:
3831          opcode = aco_opcode::buffer_store_format_xyz;
3832          break;
3833       case 4:
3834          opcode = aco_opcode::buffer_store_format_xyzw;
3835          break;
3836       default:
3837          unreachable(">4 channel buffer image store");
3838       }
3839       aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
3840       store->operands[0] = Operand(vindex);
3841       store->operands[1] = Operand(rsrc);
3842       store->operands[2] = Operand((uint32_t) 0);
3843       store->operands[3] = Operand(data);
3844       store->idxen = true;
3845       store->glc = glc;
3846       store->disable_wqm = true;
3847       store->barrier = barrier_image;
3848       ctx->program->needs_exact = true;
3849       ctx->block->instructions.emplace_back(std::move(store));
3850       return;
3851    }
3852
3853    assert(data.type() == RegType::vgpr);
3854    Temp coords = get_image_coords(ctx, instr, type);
3855    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
3856
3857    aco_ptr<MIMG_instruction> store{create_instruction<MIMG_instruction>(aco_opcode::image_store, Format::MIMG, 4, 0)};
3858    store->operands[0] = Operand(coords);
3859    store->operands[1] = Operand(resource);
3860    store->operands[2] = Operand(s4);
3861    store->operands[3] = Operand(data);
3862    store->glc = glc;
3863    store->dmask = (1 << data.size()) - 1;
3864    store->unrm = true;
3865    store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
3866    store->disable_wqm = true;
3867    store->barrier = barrier_image;
3868    ctx->program->needs_exact = true;
3869    ctx->block->instructions.emplace_back(std::move(store));
3870    return;
3871 }
3872
3873 void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
3874 {
3875    /* return the previous value if dest is ever used */
3876    bool return_previous = false;
3877    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
3878       return_previous = true;
3879       break;
3880    }
3881    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
3882       return_previous = true;
3883       break;
3884    }
3885
3886    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
3887    const struct glsl_type *type = glsl_without_array(var->type);
3888    const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3889    Builder bld(ctx->program, ctx->block);
3890
3891    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
3892    assert(data.size() == 1 && "64bit ssbo atomics not yet implemented.");
3893
3894    if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap)
3895       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), get_ssa_temp(ctx, instr->src[4].ssa), data);
3896
3897    aco_opcode buf_op, image_op;
3898    switch (instr->intrinsic) {
3899       case nir_intrinsic_image_deref_atomic_add:
3900          buf_op = aco_opcode::buffer_atomic_add;
3901          image_op = aco_opcode::image_atomic_add;
3902          break;
3903       case nir_intrinsic_image_deref_atomic_umin:
3904          buf_op = aco_opcode::buffer_atomic_umin;
3905          image_op = aco_opcode::image_atomic_umin;
3906          break;
3907       case nir_intrinsic_image_deref_atomic_imin:
3908          buf_op = aco_opcode::buffer_atomic_smin;
3909          image_op = aco_opcode::image_atomic_smin;
3910          break;
3911       case nir_intrinsic_image_deref_atomic_umax:
3912          buf_op = aco_opcode::buffer_atomic_umax;
3913          image_op = aco_opcode::image_atomic_umax;
3914          break;
3915       case nir_intrinsic_image_deref_atomic_imax:
3916          buf_op = aco_opcode::buffer_atomic_smax;
3917          image_op = aco_opcode::image_atomic_smax;
3918          break;
3919       case nir_intrinsic_image_deref_atomic_and:
3920          buf_op = aco_opcode::buffer_atomic_and;
3921          image_op = aco_opcode::image_atomic_and;
3922          break;
3923       case nir_intrinsic_image_deref_atomic_or:
3924          buf_op = aco_opcode::buffer_atomic_or;
3925          image_op = aco_opcode::image_atomic_or;
3926          break;
3927       case nir_intrinsic_image_deref_atomic_xor:
3928          buf_op = aco_opcode::buffer_atomic_xor;
3929          image_op = aco_opcode::image_atomic_xor;
3930          break;
3931       case nir_intrinsic_image_deref_atomic_exchange:
3932          buf_op = aco_opcode::buffer_atomic_swap;
3933          image_op = aco_opcode::image_atomic_swap;
3934          break;
3935       case nir_intrinsic_image_deref_atomic_comp_swap:
3936          buf_op = aco_opcode::buffer_atomic_cmpswap;
3937          image_op = aco_opcode::image_atomic_cmpswap;
3938          break;
3939       default:
3940          unreachable("visit_image_atomic should only be called with nir_intrinsic_image_deref_atomic_* instructions.");
3941    }
3942
3943    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
3944
3945    if (dim == GLSL_SAMPLER_DIM_BUF) {
3946       Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
3947       Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
3948       //assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet implemented.");
3949       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)};
3950       mubuf->operands[0] = Operand(vindex);
3951       mubuf->operands[1] = Operand(resource);
3952       mubuf->operands[2] = Operand((uint32_t)0);
3953       mubuf->operands[3] = Operand(data);
3954       if (return_previous)
3955          mubuf->definitions[0] = Definition(dst);
3956       mubuf->offset = 0;
3957       mubuf->idxen = true;
3958       mubuf->glc = return_previous;
3959       mubuf->disable_wqm = true;
3960       mubuf->barrier = barrier_image;
3961       ctx->program->needs_exact = true;
3962       ctx->block->instructions.emplace_back(std::move(mubuf));
3963       return;
3964    }
3965
3966    Temp coords = get_image_coords(ctx, instr, type);
3967    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, nullptr, true, true);
3968    aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(image_op, Format::MIMG, 4, return_previous ? 1 : 0)};
3969    mimg->operands[0] = Operand(coords);
3970    mimg->operands[1] = Operand(resource);
3971    mimg->operands[2] = Operand(s4); /* no sampler */
3972    mimg->operands[3] = Operand(data);
3973    if (return_previous)
3974       mimg->definitions[0] = Definition(dst);
3975    mimg->glc = return_previous;
3976    mimg->dmask = (1 << data.size()) - 1;
3977    mimg->unrm = true;
3978    mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
3979    mimg->disable_wqm = true;
3980    mimg->barrier = barrier_image;
3981    ctx->program->needs_exact = true;
3982    ctx->block->instructions.emplace_back(std::move(mimg));
3983    return;
3984 }
3985
3986 void get_buffer_size(isel_context *ctx, Temp desc, Temp dst, bool in_elements)
3987 {
3988    if (in_elements && ctx->options->chip_class == GFX8) {
3989       Builder bld(ctx->program, ctx->block);
3990
3991       Temp stride = emit_extract_vector(ctx, desc, 1, s1);
3992       stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, Operand((5u << 16) | 16u));
3993       stride = bld.vop1(aco_opcode::v_cvt_f32_ubyte0, bld.def(v1), stride);
3994       stride = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), stride);
3995
3996       Temp size = emit_extract_vector(ctx, desc, 2, s1);
3997       size = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), size);
3998
3999       Temp res = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), size, stride);
4000       res = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), res);
4001       bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
4002
4003       // TODO: we can probably calculate this faster on the scalar unit to do: size / stride{1,2,4,8,12,16}
4004       /* idea
4005        * for 1,2,4,8,16, the result is just (stride >> S_FF1_I32_B32)
4006        * in case 12 (or 3?), we have to divide by 3:
4007        * set v_skip in case it's 12 (if we also have to take care of 3, shift first)
4008        * use v_mul_hi_u32 with magic number to divide
4009        * we need some pseudo merge opcode to overwrite the original SALU result with readfirstlane
4010        * disable v_skip
4011        * total: 6 SALU + 2 VALU instructions vs 1 SALU + 6 VALU instructions
4012        */
4013
4014    } else {
4015       emit_extract_vector(ctx, desc, 2, dst);
4016    }
4017 }
4018
4019 void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
4020 {
4021    const nir_variable *var = nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr));
4022    const struct glsl_type *type = glsl_without_array(var->type);
4023    Builder bld(ctx->program, ctx->block);
4024
4025    if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
4026       Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, NULL, true, false);
4027       return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), true);
4028    }
4029
4030    /* LOD */
4031    Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
4032
4033    /* Resource */
4034    Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_IMAGE, NULL, true, false);
4035
4036    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4037
4038    aco_ptr<MIMG_instruction> mimg{create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1)};
4039    mimg->operands[0] = Operand(lod);
4040    mimg->operands[1] = Operand(resource);
4041    unsigned& dmask = mimg->dmask;
4042    mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
4043    mimg->da = glsl_sampler_type_is_array(type);
4044    mimg->can_reorder = true;
4045    Definition& def = mimg->definitions[0];
4046    ctx->block->instructions.emplace_back(std::move(mimg));
4047
4048    if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
4049        glsl_sampler_type_is_array(type)) {
4050
4051       assert(instr->dest.ssa.num_components == 3);
4052       Temp tmp = {ctx->program->allocateId(), v3};
4053       def = Definition(tmp);
4054       emit_split_vector(ctx, tmp, 3);
4055
4056       /* divide 3rd value by 6 by multiplying with magic number */
4057       Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
4058       Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp, 2, v1), c);
4059
4060       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4061                  emit_extract_vector(ctx, tmp, 0, v1),
4062                  emit_extract_vector(ctx, tmp, 1, v1),
4063                  by_6);
4064
4065    } else if (ctx->options->chip_class >= GFX9 &&
4066               glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
4067               glsl_sampler_type_is_array(type)) {
4068       assert(instr->dest.ssa.num_components == 2);
4069       def = Definition(dst);
4070       dmask = 0x5;
4071    } else {
4072       def = Definition(dst);
4073    }
4074
4075    emit_split_vector(ctx, dst, instr->dest.ssa.num_components);
4076 }
4077
4078 void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4079 {
4080    Builder bld(ctx->program, ctx->block);
4081    unsigned num_components = instr->num_components;
4082
4083    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4084    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4085    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4086
4087    bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4088    load_buffer(ctx, num_components, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), glc);
4089 }
4090
4091 void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4092 {
4093    Builder bld(ctx->program, ctx->block);
4094    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4095    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4096    unsigned writemask = nir_intrinsic_write_mask(instr);
4097
4098    Temp offset;
4099    if (ctx->options->chip_class < GFX8)
4100       offset = as_vgpr(ctx,get_ssa_temp(ctx, instr->src[2].ssa));
4101    else
4102       offset = get_ssa_temp(ctx, instr->src[2].ssa);
4103
4104    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4105    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4106
4107    bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
4108                ctx->options->chip_class >= GFX8;
4109    if (smem)
4110       offset = bld.as_uniform(offset);
4111    bool smem_nonfs = smem && ctx->stage != fragment_fs;
4112
4113    while (writemask) {
4114       int start, count;
4115       u_bit_scan_consecutive_range(&writemask, &start, &count);
4116       if (count == 3 && smem) {
4117          writemask |= 1u << (start + 2);
4118          count = 2;
4119       }
4120       int num_bytes = count * elem_size_bytes;
4121
4122       if (num_bytes > 16) {
4123          assert(elem_size_bytes == 8);
4124          writemask |= (((count - 2) << 1) - 1) << (start + 2);
4125          count = 2;
4126          num_bytes = 16;
4127       }
4128
4129       // TODO: check alignment of sub-dword stores
4130       // TODO: split 3 bytes. there is no store instruction for that
4131
4132       Temp write_data;
4133       if (count != instr->num_components) {
4134          emit_split_vector(ctx, data, instr->num_components);
4135          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4136          for (int i = 0; i < count; i++) {
4137             Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
4138             vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
4139          }
4140          write_data = bld.tmp(smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
4141          vec->definitions[0] = Definition(write_data);
4142          ctx->block->instructions.emplace_back(std::move(vec));
4143       } else if (!smem && data.type() != RegType::vgpr) {
4144          assert(num_bytes % 4 == 0);
4145          write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
4146       } else if (smem_nonfs && data.type() == RegType::vgpr) {
4147          assert(num_bytes % 4 == 0);
4148          write_data = bld.as_uniform(data);
4149       } else {
4150          write_data = data;
4151       }
4152
4153       aco_opcode vmem_op, smem_op;
4154       switch (num_bytes) {
4155          case 4:
4156             vmem_op = aco_opcode::buffer_store_dword;
4157             smem_op = aco_opcode::s_buffer_store_dword;
4158             break;
4159          case 8:
4160             vmem_op = aco_opcode::buffer_store_dwordx2;
4161             smem_op = aco_opcode::s_buffer_store_dwordx2;
4162             break;
4163          case 12:
4164             vmem_op = aco_opcode::buffer_store_dwordx3;
4165             smem_op = aco_opcode::last_opcode;
4166             assert(!smem);
4167             break;
4168          case 16:
4169             vmem_op = aco_opcode::buffer_store_dwordx4;
4170             smem_op = aco_opcode::s_buffer_store_dwordx4;
4171             break;
4172          default:
4173             unreachable("Store SSBO not implemented for this size.");
4174       }
4175       if (ctx->stage == fragment_fs)
4176          smem_op = aco_opcode::p_fs_buffer_store_smem;
4177
4178       if (smem) {
4179          aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
4180          store->operands[0] = Operand(rsrc);
4181          if (start) {
4182             Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
4183                                 offset, Operand(start * elem_size_bytes));
4184             store->operands[1] = Operand(off);
4185          } else {
4186             store->operands[1] = Operand(offset);
4187          }
4188          if (smem_op != aco_opcode::p_fs_buffer_store_smem)
4189             store->operands[1].setFixed(m0);
4190          store->operands[2] = Operand(write_data);
4191          store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4192          store->disable_wqm = true;
4193          store->barrier = barrier_buffer;
4194          ctx->block->instructions.emplace_back(std::move(store));
4195          ctx->program->wb_smem_l1_on_end = true;
4196          if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
4197             ctx->block->kind |= block_kind_needs_lowering;
4198             ctx->program->needs_exact = true;
4199          }
4200       } else {
4201          aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
4202          store->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4203          store->operands[1] = Operand(rsrc);
4204          store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4205          store->operands[3] = Operand(write_data);
4206          store->offset = start * elem_size_bytes;
4207          store->offen = (offset.type() == RegType::vgpr);
4208          store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4209          store->disable_wqm = true;
4210          store->barrier = barrier_buffer;
4211          ctx->program->needs_exact = true;
4212          ctx->block->instructions.emplace_back(std::move(store));
4213       }
4214    }
4215 }
4216
4217 void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
4218 {
4219    /* return the previous value if dest is ever used */
4220    bool return_previous = false;
4221    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4222       return_previous = true;
4223       break;
4224    }
4225    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4226       return_previous = true;
4227       break;
4228    }
4229
4230    Builder bld(ctx->program, ctx->block);
4231    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa));
4232
4233    if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap)
4234       data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2),
4235                         get_ssa_temp(ctx, instr->src[3].ssa), data);
4236
4237    Temp offset;
4238    if (ctx->options->chip_class < GFX8)
4239       offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4240    else
4241       offset = get_ssa_temp(ctx, instr->src[1].ssa);
4242
4243    Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4244    rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
4245
4246    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4247
4248    aco_opcode op32, op64;
4249    switch (instr->intrinsic) {
4250       case nir_intrinsic_ssbo_atomic_add:
4251          op32 = aco_opcode::buffer_atomic_add;
4252          op64 = aco_opcode::buffer_atomic_add_x2;
4253          break;
4254       case nir_intrinsic_ssbo_atomic_imin:
4255          op32 = aco_opcode::buffer_atomic_smin;
4256          op64 = aco_opcode::buffer_atomic_smin_x2;
4257          break;
4258       case nir_intrinsic_ssbo_atomic_umin:
4259          op32 = aco_opcode::buffer_atomic_umin;
4260          op64 = aco_opcode::buffer_atomic_umin_x2;
4261          break;
4262       case nir_intrinsic_ssbo_atomic_imax:
4263          op32 = aco_opcode::buffer_atomic_smax;
4264          op64 = aco_opcode::buffer_atomic_smax_x2;
4265          break;
4266       case nir_intrinsic_ssbo_atomic_umax:
4267          op32 = aco_opcode::buffer_atomic_umax;
4268          op64 = aco_opcode::buffer_atomic_umax_x2;
4269          break;
4270       case nir_intrinsic_ssbo_atomic_and:
4271          op32 = aco_opcode::buffer_atomic_and;
4272          op64 = aco_opcode::buffer_atomic_and_x2;
4273          break;
4274       case nir_intrinsic_ssbo_atomic_or:
4275          op32 = aco_opcode::buffer_atomic_or;
4276          op64 = aco_opcode::buffer_atomic_or_x2;
4277          break;
4278       case nir_intrinsic_ssbo_atomic_xor:
4279          op32 = aco_opcode::buffer_atomic_xor;
4280          op64 = aco_opcode::buffer_atomic_xor_x2;
4281          break;
4282       case nir_intrinsic_ssbo_atomic_exchange:
4283          op32 = aco_opcode::buffer_atomic_swap;
4284          op64 = aco_opcode::buffer_atomic_swap_x2;
4285          break;
4286       case nir_intrinsic_ssbo_atomic_comp_swap:
4287          op32 = aco_opcode::buffer_atomic_cmpswap;
4288          op64 = aco_opcode::buffer_atomic_cmpswap_x2;
4289          break;
4290       default:
4291          unreachable("visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions.");
4292    }
4293    aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64;
4294    aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)};
4295    mubuf->operands[0] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
4296    mubuf->operands[1] = Operand(rsrc);
4297    mubuf->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
4298    mubuf->operands[3] = Operand(data);
4299    if (return_previous)
4300       mubuf->definitions[0] = Definition(dst);
4301    mubuf->offset = 0;
4302    mubuf->offen = (offset.type() == RegType::vgpr);
4303    mubuf->glc = return_previous;
4304    mubuf->disable_wqm = true;
4305    mubuf->barrier = barrier_buffer;
4306    ctx->program->needs_exact = true;
4307    ctx->block->instructions.emplace_back(std::move(mubuf));
4308 }
4309
4310 void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
4311
4312    Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4313    Builder bld(ctx->program, ctx->block);
4314    Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
4315    get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
4316 }
4317
4318 void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
4319 {
4320    Builder bld(ctx->program, ctx->block);
4321    unsigned num_components = instr->num_components;
4322    unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8;
4323
4324    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4325    Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
4326
4327    bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
4328    aco_opcode op;
4329    if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8)) {
4330       bool global = ctx->options->chip_class >= GFX9;
4331       aco_opcode op;
4332       switch (num_bytes) {
4333       case 4:
4334          op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
4335          break;
4336       case 8:
4337          op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
4338          break;
4339       case 12:
4340          op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
4341          break;
4342       case 16:
4343          op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
4344          break;
4345       default:
4346          unreachable("load_global not implemented for this size.");
4347       }
4348       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
4349       flat->operands[0] = Operand(addr);
4350       flat->operands[1] = Operand(s1);
4351       flat->glc = glc;
4352
4353       if (dst.type() == RegType::sgpr) {
4354          Temp vec = bld.tmp(RegType::vgpr, dst.size());
4355          flat->definitions[0] = Definition(vec);
4356          ctx->block->instructions.emplace_back(std::move(flat));
4357          bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
4358       } else {
4359          flat->definitions[0] = Definition(dst);
4360          ctx->block->instructions.emplace_back(std::move(flat));
4361       }
4362       emit_split_vector(ctx, dst, num_components);
4363    } else {
4364       switch (num_bytes) {
4365          case 4:
4366             op = aco_opcode::s_load_dword;
4367             break;
4368          case 8:
4369             op = aco_opcode::s_load_dwordx2;
4370             break;
4371          case 12:
4372          case 16:
4373             op = aco_opcode::s_load_dwordx4;
4374             break;
4375          default:
4376             unreachable("load_global not implemented for this size.");
4377       }
4378       aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
4379       load->operands[0] = Operand(addr);
4380       load->operands[1] = Operand(0u);
4381       load->definitions[0] = Definition(dst);
4382       load->glc = glc;
4383       load->barrier = barrier_buffer;
4384       assert(ctx->options->chip_class >= GFX8 || !glc);
4385
4386       if (dst.size() == 3) {
4387          /* trim vector */
4388          Temp vec = bld.tmp(s4);
4389          load->definitions[0] = Definition(vec);
4390          ctx->block->instructions.emplace_back(std::move(load));
4391          emit_split_vector(ctx, vec, 4);
4392
4393          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
4394                     emit_extract_vector(ctx, vec, 0, s1),
4395                     emit_extract_vector(ctx, vec, 1, s1),
4396                     emit_extract_vector(ctx, vec, 2, s1));
4397       } else {
4398          ctx->block->instructions.emplace_back(std::move(load));
4399       }
4400    }
4401 }
4402
4403 void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
4404 {
4405    Builder bld(ctx->program, ctx->block);
4406    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4407
4408    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4409    Temp addr = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4410
4411    unsigned writemask = nir_intrinsic_write_mask(instr);
4412    while (writemask) {
4413       int start, count;
4414       u_bit_scan_consecutive_range(&writemask, &start, &count);
4415       unsigned num_bytes = count * elem_size_bytes;
4416
4417       Temp write_data = data;
4418       if (count != instr->num_components) {
4419          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4420          for (int i = 0; i < count; i++)
4421             vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
4422          write_data = bld.tmp(RegType::vgpr, count);
4423          vec->definitions[0] = Definition(write_data);
4424          ctx->block->instructions.emplace_back(std::move(vec));
4425       }
4426
4427       unsigned offset = start * elem_size_bytes;
4428       if (offset > 0 && ctx->options->chip_class < GFX9) {
4429          Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
4430          Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
4431          Temp carry = bld.tmp(s2);
4432          bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr);
4433
4434          bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), bld.hint_vcc(Definition(carry)),
4435                   Operand(offset), addr0);
4436          bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(s2),
4437                   Operand(0u), addr1,
4438                   carry).def(1).setHint(vcc);
4439
4440          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
4441
4442          offset = 0;
4443       }
4444
4445       bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
4446       bool global = ctx->options->chip_class >= GFX9;
4447       aco_opcode op;
4448       switch (num_bytes) {
4449       case 4:
4450          op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
4451          break;
4452       case 8:
4453          op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2;
4454          break;
4455       case 12:
4456          op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3;
4457          break;
4458       case 16:
4459          op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4;
4460          break;
4461       default:
4462          unreachable("store_global not implemented for this size.");
4463       }
4464       aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
4465       flat->operands[0] = Operand(addr);
4466       flat->operands[1] = Operand(s1);
4467       flat->operands[2] = Operand(data);
4468       flat->glc = glc;
4469       flat->offset = offset;
4470       ctx->block->instructions.emplace_back(std::move(flat));
4471    }
4472 }
4473
4474 void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
4475    Builder bld(ctx->program, ctx->block);
4476    switch(instr->intrinsic) {
4477       case nir_intrinsic_group_memory_barrier:
4478       case nir_intrinsic_memory_barrier:
4479          bld.barrier(aco_opcode::p_memory_barrier_all);
4480          break;
4481       case nir_intrinsic_memory_barrier_atomic_counter:
4482          bld.barrier(aco_opcode::p_memory_barrier_atomic);
4483          break;
4484       case nir_intrinsic_memory_barrier_buffer:
4485          bld.barrier(aco_opcode::p_memory_barrier_buffer);
4486          break;
4487       case nir_intrinsic_memory_barrier_image:
4488          bld.barrier(aco_opcode::p_memory_barrier_image);
4489          break;
4490       case nir_intrinsic_memory_barrier_shared:
4491          bld.barrier(aco_opcode::p_memory_barrier_shared);
4492          break;
4493       default:
4494          unreachable("Unimplemented memory barrier intrinsic");
4495          break;
4496    }
4497 }
4498
4499 Operand load_lds_size_m0(isel_context *ctx)
4500 {
4501    /* TODO: m0 does not need to be initialized on GFX9+ */
4502    Builder bld(ctx->program, ctx->block);
4503    return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
4504 }
4505
4506
4507 void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4508 {
4509    // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
4510    Operand m = load_lds_size_m0(ctx);
4511    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4512    assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
4513    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4514    Builder bld(ctx->program, ctx->block);
4515
4516    unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8;
4517    unsigned bytes_read = 0;
4518    unsigned result_size = 0;
4519    unsigned total_bytes = instr->num_components * elem_size_bytes;
4520    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : instr->dest.ssa.bit_size / 8;
4521    std::array<Temp, 4> result;
4522
4523    while (bytes_read < total_bytes) {
4524       unsigned todo = total_bytes - bytes_read;
4525       bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0;
4526       bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0;
4527
4528       aco_opcode op = aco_opcode::last_opcode;
4529       if (todo >= 16 && aligned16) {
4530          op = aco_opcode::ds_read_b128;
4531          todo = 16;
4532       } else if (todo >= 12 && aligned16) {
4533          op = aco_opcode::ds_read_b96;
4534          todo = 12;
4535       } else if (todo >= 8) {
4536          op = aligned8 ? aco_opcode::ds_read_b64 : aco_opcode::ds_read2_b32;
4537          todo = 8;
4538       } else if (todo >= 4) {
4539          op = aco_opcode::ds_read_b32;
4540          todo = 4;
4541       } else {
4542          assert(false);
4543       }
4544       assert(todo % elem_size_bytes == 0);
4545       unsigned num_elements = todo / elem_size_bytes;
4546       unsigned offset = nir_intrinsic_base(instr) + bytes_read;
4547       unsigned max_offset = op == aco_opcode::ds_read2_b32 ? 1019 : 65535;
4548
4549       Temp address_offset = address;
4550       if (offset > max_offset) {
4551          address_offset = bld.vadd32(bld.def(v1), Operand((uint32_t)nir_intrinsic_base(instr)), address_offset);
4552          offset = bytes_read;
4553       }
4554       assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */
4555
4556       Temp res;
4557       if (instr->num_components == 1 && dst.type() == RegType::vgpr)
4558          res = dst;
4559       else
4560          res = bld.tmp(RegClass(RegType::vgpr, todo / 4));
4561
4562       if (op == aco_opcode::ds_read2_b32)
4563          res = bld.ds(op, Definition(res), address_offset, m, offset >> 2, (offset >> 2) + 1);
4564       else
4565          res = bld.ds(op, Definition(res), address_offset, m, offset);
4566
4567       if (instr->num_components == 1) {
4568          assert(todo == total_bytes);
4569          if (dst.type() == RegType::sgpr)
4570             bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
4571          return;
4572       }
4573
4574       if (dst.type() == RegType::sgpr)
4575          res = bld.as_uniform(res);
4576
4577       if (num_elements == 1) {
4578          result[result_size++] = res;
4579       } else {
4580          assert(res != dst && res.size() % num_elements == 0);
4581          aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)};
4582          split->operands[0] = Operand(res);
4583          for (unsigned i = 0; i < num_elements; i++)
4584             split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4));
4585          ctx->block->instructions.emplace_back(std::move(split));
4586       }
4587
4588       bytes_read += todo;
4589    }
4590
4591    assert(result_size == instr->num_components && result_size > 1);
4592    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)};
4593    for (unsigned i = 0; i < result_size; i++)
4594       vec->operands[i] = Operand(result[i]);
4595    vec->definitions[0] = Definition(dst);
4596    ctx->block->instructions.emplace_back(std::move(vec));
4597    ctx->allocated_vec.emplace(dst.id(), result);
4598 }
4599
4600 void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned offset0, unsigned offset1, unsigned align)
4601 {
4602    Builder bld(ctx->program, ctx->block);
4603    unsigned bytes_written = 0;
4604    while (bytes_written < data.size() * 4) {
4605       unsigned todo = data.size() * 4 - bytes_written;
4606       bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
4607       bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
4608
4609       aco_opcode op = aco_opcode::last_opcode;
4610       unsigned size = 0;
4611       if (todo >= 16 && aligned16) {
4612          op = aco_opcode::ds_write_b128;
4613          size = 4;
4614       } else if (todo >= 12 && aligned16) {
4615          op = aco_opcode::ds_write_b96;
4616          size = 3;
4617       } else if (todo >= 8) {
4618          op = aligned8 ? aco_opcode::ds_write_b64 : aco_opcode::ds_write2_b32;
4619          size = 2;
4620       } else if (todo >= 4) {
4621          op = aco_opcode::ds_write_b32;
4622          size = 1;
4623       } else {
4624          assert(false);
4625       }
4626
4627       bool write2 = op == aco_opcode::ds_write2_b32;
4628       unsigned offset = offset0 + offset1 + bytes_written;
4629       unsigned max_offset = write2 ? 1020 : 65535;
4630       Temp address_offset = address;
4631       if (offset > max_offset) {
4632          address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
4633          offset = offset1 + bytes_written;
4634       }
4635       assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
4636
4637       if (write2) {
4638          Temp val0 = emit_extract_vector(ctx, data, bytes_written >> 2, v1);
4639          Temp val1 = emit_extract_vector(ctx, data, (bytes_written >> 2) + 1, v1);
4640          bld.ds(op, address_offset, val0, val1, m, offset >> 2, (offset >> 2) + 1);
4641       } else {
4642          Temp val = emit_extract_vector(ctx, data, bytes_written >> 2, RegClass(RegType::vgpr, size));
4643          bld.ds(op, address_offset, val, m, offset);
4644       }
4645
4646       bytes_written += size * 4;
4647    }
4648 }
4649
4650 void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
4651 {
4652    unsigned offset = nir_intrinsic_base(instr);
4653    unsigned writemask = nir_intrinsic_write_mask(instr);
4654    Operand m = load_lds_size_m0(ctx);
4655    Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
4656    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4657    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4658    assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
4659
4660    /* we need at most two stores for 32bit variables */
4661    int start[2], count[2];
4662    u_bit_scan_consecutive_range(&writemask, &start[0], &count[0]);
4663    u_bit_scan_consecutive_range(&writemask, &start[1], &count[1]);
4664    assert(writemask == 0);
4665
4666    /* one combined store is sufficient */
4667    if (count[0] == count[1]) {
4668       Builder bld(ctx->program, ctx->block);
4669
4670       Temp address_offset = address;
4671       if ((offset >> 2) + start[1] > 255) {
4672          address_offset = bld.vadd32(bld.def(v1), Operand(offset), address_offset);
4673          offset = 0;
4674       }
4675
4676       assert(count[0] == 1);
4677       Temp val0 = emit_extract_vector(ctx, data, start[0], v1);
4678       Temp val1 = emit_extract_vector(ctx, data, start[1], v1);
4679       aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
4680       offset = offset / elem_size_bytes;
4681       bld.ds(op, address_offset, val0, val1, m,
4682              offset + start[0], offset + start[1]);
4683       return;
4684    }
4685
4686    unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
4687    for (unsigned i = 0; i < 2; i++) {
4688       if (count[i] == 0)
4689          continue;
4690
4691       Temp write_data = emit_extract_vector(ctx, data, start[i], RegClass(RegType::vgpr, count[i] * elem_size_bytes / 4));
4692       ds_write_helper(ctx, m, address, write_data, offset, start[i] * elem_size_bytes, align);
4693    }
4694    return;
4695 }
4696
4697 void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
4698 {
4699    unsigned offset = nir_intrinsic_base(instr);
4700    Operand m = load_lds_size_m0(ctx);
4701    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4702    Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4703
4704    unsigned num_operands = 3;
4705    aco_opcode op32, op64, op32_rtn, op64_rtn;
4706    switch(instr->intrinsic) {
4707       case nir_intrinsic_shared_atomic_add:
4708          op32 = aco_opcode::ds_add_u32;
4709          op64 = aco_opcode::ds_add_u64;
4710          op32_rtn = aco_opcode::ds_add_rtn_u32;
4711          op64_rtn = aco_opcode::ds_add_rtn_u64;
4712          break;
4713       case nir_intrinsic_shared_atomic_imin:
4714          op32 = aco_opcode::ds_min_i32;
4715          op64 = aco_opcode::ds_min_i64;
4716          op32_rtn = aco_opcode::ds_min_rtn_i32;
4717          op64_rtn = aco_opcode::ds_min_rtn_i64;
4718          break;
4719       case nir_intrinsic_shared_atomic_umin:
4720          op32 = aco_opcode::ds_min_u32;
4721          op64 = aco_opcode::ds_min_u64;
4722          op32_rtn = aco_opcode::ds_min_rtn_u32;
4723          op64_rtn = aco_opcode::ds_min_rtn_u64;
4724          break;
4725       case nir_intrinsic_shared_atomic_imax:
4726          op32 = aco_opcode::ds_max_i32;
4727          op64 = aco_opcode::ds_max_i64;
4728          op32_rtn = aco_opcode::ds_max_rtn_i32;
4729          op64_rtn = aco_opcode::ds_max_rtn_i64;
4730          break;
4731       case nir_intrinsic_shared_atomic_umax:
4732          op32 = aco_opcode::ds_max_u32;
4733          op64 = aco_opcode::ds_max_u64;
4734          op32_rtn = aco_opcode::ds_max_rtn_u32;
4735          op64_rtn = aco_opcode::ds_max_rtn_u64;
4736          break;
4737       case nir_intrinsic_shared_atomic_and:
4738          op32 = aco_opcode::ds_and_b32;
4739          op64 = aco_opcode::ds_and_b64;
4740          op32_rtn = aco_opcode::ds_and_rtn_b32;
4741          op64_rtn = aco_opcode::ds_and_rtn_b64;
4742          break;
4743       case nir_intrinsic_shared_atomic_or:
4744          op32 = aco_opcode::ds_or_b32;
4745          op64 = aco_opcode::ds_or_b64;
4746          op32_rtn = aco_opcode::ds_or_rtn_b32;
4747          op64_rtn = aco_opcode::ds_or_rtn_b64;
4748          break;
4749       case nir_intrinsic_shared_atomic_xor:
4750          op32 = aco_opcode::ds_xor_b32;
4751          op64 = aco_opcode::ds_xor_b64;
4752          op32_rtn = aco_opcode::ds_xor_rtn_b32;
4753          op64_rtn = aco_opcode::ds_xor_rtn_b64;
4754          break;
4755       case nir_intrinsic_shared_atomic_exchange:
4756          op32 = aco_opcode::ds_write_b32;
4757          op64 = aco_opcode::ds_write_b64;
4758          op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
4759          op64_rtn = aco_opcode::ds_wrxchg2_rtn_b64;
4760          break;
4761       case nir_intrinsic_shared_atomic_comp_swap:
4762          op32 = aco_opcode::ds_cmpst_b32;
4763          op64 = aco_opcode::ds_cmpst_b64;
4764          op32_rtn = aco_opcode::ds_cmpst_rtn_b32;
4765          op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
4766          num_operands = 4;
4767          break;
4768       default:
4769          unreachable("Unhandled shared atomic intrinsic");
4770    }
4771
4772    /* return the previous value if dest is ever used */
4773    bool return_previous = false;
4774    nir_foreach_use_safe(use_src, &instr->dest.ssa) {
4775       return_previous = true;
4776       break;
4777    }
4778    nir_foreach_if_use_safe(use_src, &instr->dest.ssa) {
4779       return_previous = true;
4780       break;
4781    }
4782
4783    aco_opcode op;
4784    if (data.size() == 1) {
4785       assert(instr->dest.ssa.bit_size == 32);
4786       op = return_previous ? op32_rtn : op32;
4787    } else {
4788       assert(instr->dest.ssa.bit_size == 64);
4789       op = return_previous ? op64_rtn : op64;
4790    }
4791
4792    if (offset > 65535) {
4793       Builder bld(ctx->program, ctx->block);
4794       address = bld.vadd32(bld.def(v1), Operand(offset), address);
4795       offset = 0;
4796    }
4797
4798    aco_ptr<DS_instruction> ds;
4799    ds.reset(create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0));
4800    ds->operands[0] = Operand(address);
4801    ds->operands[1] = Operand(data);
4802    if (num_operands == 4)
4803       ds->operands[2] = Operand(get_ssa_temp(ctx, instr->src[2].ssa));
4804    ds->operands[num_operands - 1] = m;
4805    ds->offset0 = offset;
4806    if (return_previous)
4807       ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
4808    ctx->block->instructions.emplace_back(std::move(ds));
4809 }
4810
4811 void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
4812    assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64);
4813    Builder bld(ctx->program, ctx->block);
4814    Temp scratch_addr = ctx->private_segment_buffer;
4815    if (ctx->stage != MESA_SHADER_COMPUTE)
4816       scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), ctx->private_segment_buffer, Operand(0u));
4817    uint32_t rsrc_conf;
4818    /* older generations need element size = 16 bytes */
4819    if (ctx->program->chip_class >= GFX9)
4820       rsrc_conf = 0x00E00000u;
4821    else
4822       rsrc_conf = 0x00F80000u;
4823    /* buffer res = addr + num_records = -1, index_stride = 64, add_tid_enable = true */
4824    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
4825    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4826    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4827
4828    aco_opcode op;
4829    switch (dst.size()) {
4830       case 1:
4831          op = aco_opcode::buffer_load_dword;
4832          break;
4833       case 2:
4834          op = aco_opcode::buffer_load_dwordx2;
4835          break;
4836       case 3:
4837          op = aco_opcode::buffer_load_dwordx3;
4838          break;
4839       case 4:
4840          op = aco_opcode::buffer_load_dwordx4;
4841          break;
4842       case 6:
4843       case 8: {
4844          std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
4845          Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
4846                                 bld.def(v4), offset, rsrc,
4847                                 ctx->scratch_offset, 0, true);
4848          Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
4849                                                   aco_opcode::buffer_load_dwordx4,
4850                                 dst.size() == 6 ? bld.def(v2) : bld.def(v4),
4851                                 offset, rsrc, ctx->scratch_offset, 16, true);
4852          emit_split_vector(ctx, lower, 2);
4853          elems[0] = emit_extract_vector(ctx, lower, 0, v2);
4854          elems[1] = emit_extract_vector(ctx, lower, 1, v2);
4855          if (dst.size() == 8) {
4856             emit_split_vector(ctx, upper, 2);
4857             elems[2] = emit_extract_vector(ctx, upper, 0, v2);
4858             elems[3] = emit_extract_vector(ctx, upper, 1, v2);
4859          } else {
4860             elems[2] = upper;
4861          }
4862
4863          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
4864                                                                          Format::PSEUDO, dst.size() / 2, 1)};
4865          for (unsigned i = 0; i < dst.size() / 2; i++)
4866             vec->operands[i] = Operand(elems[i]);
4867          vec->definitions[0] = Definition(dst);
4868          bld.insert(std::move(vec));
4869          ctx->allocated_vec.emplace(dst.id(), elems);
4870          return;
4871       }
4872       default:
4873          unreachable("Wrong dst size for nir_intrinsic_load_scratch");
4874    }
4875
4876    bld.mubuf(op, Definition(dst), offset, rsrc, ctx->scratch_offset, 0, true);
4877    emit_split_vector(ctx, dst, instr->num_components);
4878 }
4879
4880 void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
4881    assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
4882    Builder bld(ctx->program, ctx->block);
4883    Temp scratch_addr = ctx->private_segment_buffer;
4884    if (ctx->stage != MESA_SHADER_COMPUTE)
4885       scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), ctx->private_segment_buffer, Operand(0u));
4886    uint32_t rsrc_conf;
4887    /* older generations need element size = 16 bytes */
4888    if (ctx->program->chip_class >= GFX9)
4889       rsrc_conf = 0x00E00000u;
4890    else
4891       rsrc_conf = 0x00F80000u;
4892    /* buffer res = addr + num_records = -1, index_stride = 64, add_tid_enable = true */
4893    Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
4894    Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
4895    Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
4896
4897    unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
4898    unsigned writemask = nir_intrinsic_write_mask(instr);
4899
4900    while (writemask) {
4901       int start, count;
4902       u_bit_scan_consecutive_range(&writemask, &start, &count);
4903       int num_bytes = count * elem_size_bytes;
4904
4905       if (num_bytes > 16) {
4906          assert(elem_size_bytes == 8);
4907          writemask |= (((count - 2) << 1) - 1) << (start + 2);
4908          count = 2;
4909          num_bytes = 16;
4910       }
4911
4912       // TODO: check alignment of sub-dword stores
4913       // TODO: split 3 bytes. there is no store instruction for that
4914
4915       Temp write_data;
4916       if (count != instr->num_components) {
4917          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
4918          for (int i = 0; i < count; i++) {
4919             Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
4920             vec->operands[i] = Operand(elem);
4921          }
4922          write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
4923          vec->definitions[0] = Definition(write_data);
4924          ctx->block->instructions.emplace_back(std::move(vec));
4925       } else {
4926          write_data = data;
4927       }
4928
4929       aco_opcode op;
4930       switch (num_bytes) {
4931          case 4:
4932             op = aco_opcode::buffer_store_dword;
4933             break;
4934          case 8:
4935             op = aco_opcode::buffer_store_dwordx2;
4936             break;
4937          case 12:
4938             op = aco_opcode::buffer_store_dwordx3;
4939             break;
4940          case 16:
4941             op = aco_opcode::buffer_store_dwordx4;
4942             break;
4943          default:
4944             unreachable("Invalid data size for nir_intrinsic_store_scratch.");
4945       }
4946
4947       bld.mubuf(op, offset, rsrc, ctx->scratch_offset, write_data, start * elem_size_bytes, true);
4948    }
4949 }
4950
4951 void visit_load_sample_mask_in(isel_context *ctx, nir_intrinsic_instr *instr) {
4952    uint8_t log2_ps_iter_samples;
4953    if (ctx->program->info->ps.force_persample) {
4954       log2_ps_iter_samples =
4955          util_logbase2(ctx->options->key.fs.num_samples);
4956    } else {
4957       log2_ps_iter_samples = ctx->options->key.fs.log2_ps_iter_samples;
4958    }
4959
4960    /* The bit pattern matches that used by fixed function fragment
4961     * processing. */
4962    static const unsigned ps_iter_masks[] = {
4963       0xffff, /* not used */
4964       0x5555,
4965       0x1111,
4966       0x0101,
4967       0x0001,
4968    };
4969    assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
4970
4971    Builder bld(ctx->program, ctx->block);
4972
4973    Temp sample_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), ctx->fs_inputs[fs_input::ancillary], Operand(8u), Operand(4u));
4974    Temp ps_iter_mask = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(ps_iter_masks[log2_ps_iter_samples]));
4975    Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, ps_iter_mask);
4976    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
4977    bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, ctx->fs_inputs[fs_input::sample_coverage]);
4978 }
4979
4980 Temp emit_boolean_reduce(isel_context *ctx, nir_op op, unsigned cluster_size, Temp src)
4981 {
4982    Builder bld(ctx->program, ctx->block);
4983
4984    if (cluster_size == 1) {
4985       return src;
4986    } if (op == nir_op_iand && cluster_size == 4) {
4987       //subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val)
4988       Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
4989       return bld.sop1(aco_opcode::s_not_b64, bld.def(s2), bld.def(s1, scc),
4990                       bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc), tmp));
4991    } else if (op == nir_op_ior && cluster_size == 4) {
4992       //subgroupClusteredOr(val, 4) -> wqm(val & exec)
4993       return bld.sop1(aco_opcode::s_wqm_b64, bld.def(s2), bld.def(s1, scc),
4994                       bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)));
4995    } else if (op == nir_op_iand && cluster_size == 64) {
4996       //subgroupAnd(val) -> (exec & ~val) == 0
4997       Temp tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src).def(1).getTemp();
4998       return bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), tmp, Operand(0u));
4999    } else if (op == nir_op_ior && cluster_size == 64) {
5000       //subgroupOr(val) -> (val & exec) != 0
5001       return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)).def(1).getTemp();
5002    } else if (op == nir_op_ixor && cluster_size == 64) {
5003       //subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1
5004       Temp tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5005       tmp = bld.sop1(aco_opcode::s_bcnt1_i32_b64, bld.def(s2), bld.def(s1, scc), tmp);
5006       return bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand(1u)).def(1).getTemp();
5007    } else {
5008       //subgroupClustered{And,Or,Xor}(val, n) ->
5009       //lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0))
5010       //cluster_offset = ~(n - 1) & lane_id
5011       //cluster_mask = ((1 << n) - 1)
5012       //subgroupClusteredAnd():
5013       //   return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
5014       //subgroupClusteredOr():
5015       //   return ((val & exec) >> cluster_offset) & cluster_mask != 0
5016       //subgroupClusteredXor():
5017       //   return v_bnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
5018       Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5019                               bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5020       Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(~uint32_t(cluster_size - 1)), lane_id);
5021
5022       Temp tmp;
5023       if (op == nir_op_iand)
5024          tmp = bld.sop2(aco_opcode::s_orn2_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5025       else
5026          tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5027
5028       uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
5029       tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
5030       tmp = emit_extract_vector(ctx, tmp, 0, v1);
5031       if (cluster_mask != 0xffffffff)
5032          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(cluster_mask), tmp);
5033
5034       Definition cmp_def = Definition();
5035       if (op == nir_op_iand) {
5036          cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(cluster_mask), tmp).def(0);
5037       } else if (op == nir_op_ior) {
5038          cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5039       } else if (op == nir_op_ixor) {
5040          tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u),
5041                         bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand(0u)));
5042          cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp).def(0);
5043       }
5044       cmp_def.setHint(vcc);
5045       return cmp_def.getTemp();
5046    }
5047 }
5048
5049 Temp emit_boolean_exclusive_scan(isel_context *ctx, nir_op op, Temp src)
5050 {
5051    Builder bld(ctx->program, ctx->block);
5052
5053    //subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
5054    //subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
5055    //subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
5056    Temp tmp;
5057    if (op == nir_op_iand)
5058       tmp = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc), Operand(exec, s2), src);
5059    else
5060       tmp = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
5061
5062    Builder::Result lohi = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), tmp);
5063    Temp lo = lohi.def(0).getTemp();
5064    Temp hi = lohi.def(1).getTemp();
5065    Temp mbcnt = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), hi,
5066                          bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), lo, Operand(0u)));
5067
5068    Definition cmp_def = Definition();
5069    if (op == nir_op_iand)
5070       cmp_def = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5071    else if (op == nir_op_ior)
5072       cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), mbcnt).def(0);
5073    else if (op == nir_op_ixor)
5074       cmp_def = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u),
5075                          bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), mbcnt)).def(0);
5076    cmp_def.setHint(vcc);
5077    return cmp_def.getTemp();
5078 }
5079
5080 Temp emit_boolean_inclusive_scan(isel_context *ctx, nir_op op, Temp src)
5081 {
5082    Builder bld(ctx->program, ctx->block);
5083
5084    //subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val
5085    //subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val
5086    //subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val
5087    Temp tmp = emit_boolean_exclusive_scan(ctx, op, src);
5088    if (op == nir_op_iand)
5089       return bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5090    else if (op == nir_op_ior)
5091       return bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5092    else if (op == nir_op_ixor)
5093       return bld.sop2(aco_opcode::s_xor_b64, bld.def(s2), bld.def(s1, scc), tmp, src);
5094
5095    assert(false);
5096    return Temp();
5097 }
5098
5099 void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp src)
5100 {
5101    Builder bld(ctx->program, ctx->block);
5102    Definition dst(get_ssa_temp(ctx, &instr->dest.ssa));
5103    if (src.regClass().type() == RegType::vgpr) {
5104       bld.pseudo(aco_opcode::p_as_uniform, dst, src);
5105    } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5106       bld.sopc(aco_opcode::s_cmp_lg_u64, bld.scc(dst), Operand(0u), Operand(src));
5107    } else if (src.regClass() == s1) {
5108       bld.sop1(aco_opcode::s_mov_b32, dst, src);
5109    } else if (src.regClass() == s2) {
5110       bld.sop1(aco_opcode::s_mov_b64, dst, src);
5111    } else {
5112       fprintf(stderr, "Unimplemented NIR instr bit size: ");
5113       nir_print_instr(&instr->instr, stderr);
5114       fprintf(stderr, "\n");
5115    }
5116 }
5117
5118 void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
5119 {
5120    Builder bld(ctx->program, ctx->block);
5121    Temp p1 = ctx->fs_inputs[fs_input::persp_center_p1];
5122    Temp p2 = ctx->fs_inputs[fs_input::persp_center_p2];
5123
5124    /* Build DD X/Y */
5125    Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_quad_perm(0, 0, 0, 0));
5126    Temp ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(1, 1, 1, 1));
5127    Temp ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_quad_perm(2, 2, 2, 2));
5128    Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_quad_perm(0, 0, 0, 0));
5129    Temp ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(1, 1, 1, 1));
5130    Temp ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_quad_perm(2, 2, 2, 2));
5131
5132    /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
5133    Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
5134    Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
5135    tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
5136    tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
5137    Temp wqm1 = bld.tmp(v1);
5138    emit_wqm(ctx, tmp1, wqm1, true);
5139    Temp wqm2 = bld.tmp(v1);
5140    emit_wqm(ctx, tmp2, wqm2, true);
5141    bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2);
5142    return;
5143 }
5144
5145 void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
5146 {
5147    Builder bld(ctx->program, ctx->block);
5148    switch(instr->intrinsic) {
5149    case nir_intrinsic_load_barycentric_sample:
5150    case nir_intrinsic_load_barycentric_pixel:
5151    case nir_intrinsic_load_barycentric_centroid: {
5152       glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr);
5153       fs_input input = get_interp_input(instr->intrinsic, mode);
5154
5155       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5156       if (input == fs_input::max_inputs) {
5157          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5158                     Operand(0u), Operand(0u));
5159       } else {
5160          bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5161                     ctx->fs_inputs[input],
5162                     ctx->fs_inputs[input + 1]);
5163       }
5164       emit_split_vector(ctx, dst, 2);
5165       break;
5166    }
5167    case nir_intrinsic_load_barycentric_at_sample: {
5168       uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16;
5169       switch (ctx->options->key.fs.num_samples) {
5170          case 2: sample_pos_offset += 1 << 3; break;
5171          case 4: sample_pos_offset += 3 << 3; break;
5172          case 8: sample_pos_offset += 7 << 3; break;
5173          default: break;
5174       }
5175       Temp sample_pos;
5176       Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
5177       nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
5178       if (addr.type() == RegType::sgpr) {
5179          Operand offset;
5180          if (const_addr) {
5181             sample_pos_offset += const_addr->u32 << 3;
5182             offset = Operand(sample_pos_offset);
5183          } else if (ctx->options->chip_class >= GFX9) {
5184             offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
5185          } else {
5186             offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
5187             offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
5188          }
5189          addr = ctx->private_segment_buffer;
5190          sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand(offset));
5191
5192       } else if (ctx->options->chip_class >= GFX9) {
5193          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5194          sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, ctx->private_segment_buffer, sample_pos_offset);
5195       } else {
5196          /* addr += ctx->private_segment_buffer + sample_pos_offset */
5197          Temp tmp0 = bld.tmp(s1);
5198          Temp tmp1 = bld.tmp(s1);
5199          bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), ctx->private_segment_buffer);
5200          Definition scc_tmp = bld.def(s1, scc);
5201          tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
5202          tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), scc_tmp.getTemp());
5203          addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
5204          Temp pck0 = bld.tmp(v1);
5205          Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp();
5206          tmp1 = as_vgpr(ctx, tmp1);
5207          Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), bld.hint_vcc(bld.def(s2)), tmp1, Operand(0u), carry);
5208          addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1);
5209
5210          /* sample_pos = flat_load_dwordx2 addr */
5211          sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1));
5212       }
5213
5214       /* sample_pos -= 0.5 */
5215       Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1));
5216       Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1));
5217       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), sample_pos);
5218       pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand(0x3f000000u));
5219       pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand(0x3f000000u));
5220
5221       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5222       break;
5223    }
5224    case nir_intrinsic_load_barycentric_at_offset: {
5225       Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
5226       RegClass rc = RegClass(offset.type(), 1);
5227       Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc);
5228       bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset);
5229       emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), pos1, pos2);
5230       break;
5231    }
5232    case nir_intrinsic_load_front_face: {
5233       bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5234                Operand(0u), ctx->fs_inputs[fs_input::front_face]).def(0).setHint(vcc);
5235       break;
5236    }
5237    case nir_intrinsic_load_view_index:
5238    case nir_intrinsic_load_layer_id: {
5239       if (instr->intrinsic == nir_intrinsic_load_view_index && (ctx->stage & sw_vs)) {
5240          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5241          bld.copy(Definition(dst), Operand(ctx->view_index));
5242          break;
5243       }
5244
5245       unsigned idx = nir_intrinsic_base(instr);
5246       bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5247                  Operand(2u), bld.m0(ctx->prim_mask), idx, 0);
5248       break;
5249    }
5250    case nir_intrinsic_load_frag_coord: {
5251       emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4);
5252       break;
5253    }
5254    case nir_intrinsic_load_sample_pos: {
5255       Temp posx = ctx->fs_inputs[fs_input::frag_pos_0];
5256       Temp posy = ctx->fs_inputs[fs_input::frag_pos_1];
5257       bld.pseudo(aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5258                  posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand(0u),
5259                  posy.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand(0u));
5260       break;
5261    }
5262    case nir_intrinsic_load_interpolated_input:
5263       visit_load_interpolated_input(ctx, instr);
5264       break;
5265    case nir_intrinsic_store_output:
5266       visit_store_output(ctx, instr);
5267       break;
5268    case nir_intrinsic_load_input:
5269       visit_load_input(ctx, instr);
5270       break;
5271    case nir_intrinsic_load_ubo:
5272       visit_load_ubo(ctx, instr);
5273       break;
5274    case nir_intrinsic_load_push_constant:
5275       visit_load_push_constant(ctx, instr);
5276       break;
5277    case nir_intrinsic_load_constant:
5278       visit_load_constant(ctx, instr);
5279       break;
5280    case nir_intrinsic_vulkan_resource_index:
5281       visit_load_resource(ctx, instr);
5282       break;
5283    case nir_intrinsic_discard:
5284       visit_discard(ctx, instr);
5285       break;
5286    case nir_intrinsic_discard_if:
5287       visit_discard_if(ctx, instr);
5288       break;
5289    case nir_intrinsic_load_shared:
5290       visit_load_shared(ctx, instr);
5291       break;
5292    case nir_intrinsic_store_shared:
5293       visit_store_shared(ctx, instr);
5294       break;
5295    case nir_intrinsic_shared_atomic_add:
5296    case nir_intrinsic_shared_atomic_imin:
5297    case nir_intrinsic_shared_atomic_umin:
5298    case nir_intrinsic_shared_atomic_imax:
5299    case nir_intrinsic_shared_atomic_umax:
5300    case nir_intrinsic_shared_atomic_and:
5301    case nir_intrinsic_shared_atomic_or:
5302    case nir_intrinsic_shared_atomic_xor:
5303    case nir_intrinsic_shared_atomic_exchange:
5304    case nir_intrinsic_shared_atomic_comp_swap:
5305       visit_shared_atomic(ctx, instr);
5306       break;
5307    case nir_intrinsic_image_deref_load:
5308       visit_image_load(ctx, instr);
5309       break;
5310    case nir_intrinsic_image_deref_store:
5311       visit_image_store(ctx, instr);
5312       break;
5313    case nir_intrinsic_image_deref_atomic_add:
5314    case nir_intrinsic_image_deref_atomic_umin:
5315    case nir_intrinsic_image_deref_atomic_imin:
5316    case nir_intrinsic_image_deref_atomic_umax:
5317    case nir_intrinsic_image_deref_atomic_imax:
5318    case nir_intrinsic_image_deref_atomic_and:
5319    case nir_intrinsic_image_deref_atomic_or:
5320    case nir_intrinsic_image_deref_atomic_xor:
5321    case nir_intrinsic_image_deref_atomic_exchange:
5322    case nir_intrinsic_image_deref_atomic_comp_swap:
5323       visit_image_atomic(ctx, instr);
5324       break;
5325    case nir_intrinsic_image_deref_size:
5326       visit_image_size(ctx, instr);
5327       break;
5328    case nir_intrinsic_load_ssbo:
5329       visit_load_ssbo(ctx, instr);
5330       break;
5331    case nir_intrinsic_store_ssbo:
5332       visit_store_ssbo(ctx, instr);
5333       break;
5334    case nir_intrinsic_load_global:
5335       visit_load_global(ctx, instr);
5336       break;
5337    case nir_intrinsic_store_global:
5338       visit_store_global(ctx, instr);
5339       break;
5340    case nir_intrinsic_ssbo_atomic_add:
5341    case nir_intrinsic_ssbo_atomic_imin:
5342    case nir_intrinsic_ssbo_atomic_umin:
5343    case nir_intrinsic_ssbo_atomic_imax:
5344    case nir_intrinsic_ssbo_atomic_umax:
5345    case nir_intrinsic_ssbo_atomic_and:
5346    case nir_intrinsic_ssbo_atomic_or:
5347    case nir_intrinsic_ssbo_atomic_xor:
5348    case nir_intrinsic_ssbo_atomic_exchange:
5349    case nir_intrinsic_ssbo_atomic_comp_swap:
5350       visit_atomic_ssbo(ctx, instr);
5351       break;
5352    case nir_intrinsic_load_scratch:
5353       visit_load_scratch(ctx, instr);
5354       break;
5355    case nir_intrinsic_store_scratch:
5356       visit_store_scratch(ctx, instr);
5357       break;
5358    case nir_intrinsic_get_buffer_size:
5359       visit_get_buffer_size(ctx, instr);
5360       break;
5361    case nir_intrinsic_barrier: {
5362       unsigned* bsize = ctx->program->info->cs.block_size;
5363       unsigned workgroup_size = bsize[0] * bsize[1] * bsize[2];
5364       if (workgroup_size > 64)
5365          bld.sopp(aco_opcode::s_barrier);
5366       break;
5367    }
5368    case nir_intrinsic_group_memory_barrier:
5369    case nir_intrinsic_memory_barrier:
5370    case nir_intrinsic_memory_barrier_atomic_counter:
5371    case nir_intrinsic_memory_barrier_buffer:
5372    case nir_intrinsic_memory_barrier_image:
5373    case nir_intrinsic_memory_barrier_shared:
5374       emit_memory_barrier(ctx, instr);
5375       break;
5376    case nir_intrinsic_load_num_work_groups:
5377    case nir_intrinsic_load_work_group_id:
5378    case nir_intrinsic_load_local_invocation_id: {
5379       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5380       Temp* ids;
5381       if (instr->intrinsic == nir_intrinsic_load_num_work_groups)
5382          ids = ctx->num_workgroups;
5383       else if (instr->intrinsic == nir_intrinsic_load_work_group_id)
5384          ids = ctx->workgroup_ids;
5385       else
5386          ids = ctx->local_invocation_ids;
5387       bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
5388                  ids[0].id() ? Operand(ids[0]) : Operand(1u),
5389                  ids[1].id() ? Operand(ids[1]) : Operand(1u),
5390                  ids[2].id() ? Operand(ids[2]) : Operand(1u));
5391       emit_split_vector(ctx, dst, 3);
5392       break;
5393    }
5394    case nir_intrinsic_load_local_invocation_index: {
5395       Temp id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
5396                          bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5397       Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
5398       bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, id);
5399       break;
5400    }
5401    case nir_intrinsic_load_subgroup_id: {
5402       if (ctx->stage == compute_cs) {
5403          Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), Operand(0xfc0u), ctx->tg_size);
5404          bld.sop2(aco_opcode::s_lshr_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), tg_num, Operand(0x6u));
5405       } else {
5406          bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x0u));
5407       }
5408       break;
5409    }
5410    case nir_intrinsic_load_subgroup_invocation: {
5411       bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand((uint32_t) -1),
5412                bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
5413       break;
5414    }
5415    case nir_intrinsic_load_num_subgroups: {
5416       if (ctx->stage == compute_cs)
5417          bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), bld.def(s1, scc), Operand(0x3fu), ctx->tg_size);
5418       else
5419          bld.sop1(aco_opcode::s_mov_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand(0x1u));
5420       break;
5421    }
5422    case nir_intrinsic_ballot: {
5423       Definition tmp = bld.def(s2);
5424       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5425       if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s2) {
5426          bld.sop2(aco_opcode::s_and_b64, tmp, bld.def(s1, scc), Operand(exec, s2), src);
5427       } else if (instr->src[0].ssa->bit_size == 1 && src.regClass() == s1) {
5428          bld.sop2(aco_opcode::s_cselect_b64, tmp, Operand(exec, s2), Operand(0u), bld.scc(src));
5429       } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) {
5430          bld.vopc(aco_opcode::v_cmp_lg_u32, tmp, Operand(0u), src);
5431       } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
5432          bld.vopc(aco_opcode::v_cmp_lg_u64, tmp, Operand(0u), src);
5433       } else {
5434          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5435          nir_print_instr(&instr->instr, stderr);
5436          fprintf(stderr, "\n");
5437       }
5438       emit_wqm(ctx, tmp.getTemp(), get_ssa_temp(ctx, &instr->dest.ssa));
5439       break;
5440    }
5441    case nir_intrinsic_shuffle: {
5442       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5443       if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5444          emit_uniform_subgroup(ctx, instr, src);
5445       } else {
5446          Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
5447          assert(tid.regClass() == v1);
5448          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5449          if (src.regClass() == v1) {
5450             tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
5451             emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, src), dst);
5452          } else if (src.regClass() == v2) {
5453             tid = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), tid);
5454
5455             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5456             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5457             lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, lo));
5458             hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), tid, hi));
5459             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5460             emit_split_vector(ctx, dst, 2);
5461          } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5462             Temp tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src);
5463             tmp = emit_extract_vector(ctx, tmp, 0, v1);
5464             tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
5465             emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), tmp), dst);
5466          } else {
5467             fprintf(stderr, "Unimplemented NIR instr bit size: ");
5468             nir_print_instr(&instr->instr, stderr);
5469             fprintf(stderr, "\n");
5470          }
5471       }
5472       break;
5473    }
5474    case nir_intrinsic_load_sample_id: {
5475       bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
5476                ctx->fs_inputs[ancillary], Operand(8u), Operand(4u));
5477       break;
5478    }
5479    case nir_intrinsic_load_sample_mask_in: {
5480       visit_load_sample_mask_in(ctx, instr);
5481       break;
5482    }
5483    case nir_intrinsic_read_first_invocation: {
5484       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5485       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5486       if (src.regClass() == v1) {
5487          emit_wqm(ctx,
5488                   bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
5489                   dst);
5490       } else if (src.regClass() == v2) {
5491          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5492          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5493          lo = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo));
5494          hi = emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi));
5495          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5496          emit_split_vector(ctx, dst, 2);
5497       } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5498          emit_wqm(ctx,
5499                   bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src,
5500                            bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2))),
5501                   dst);
5502       } else if (src.regClass() == s1) {
5503          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
5504       } else if (src.regClass() == s2) {
5505          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
5506       } else {
5507          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5508          nir_print_instr(&instr->instr, stderr);
5509          fprintf(stderr, "\n");
5510       }
5511       break;
5512    }
5513    case nir_intrinsic_read_invocation: {
5514       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5515       Temp lane = get_ssa_temp(ctx, instr->src[1].ssa);
5516       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5517       assert(lane.regClass() == s1);
5518       if (src.regClass() == v1) {
5519          emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), src, lane), dst);
5520       } else if (src.regClass() == v2) {
5521          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5522          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5523          lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), lo, lane));
5524          hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_readlane_b32, bld.def(s1), hi, lane));
5525          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5526          emit_split_vector(ctx, dst, 2);
5527       } else if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5528          emit_wqm(ctx, bld.sopc(aco_opcode::s_bitcmp1_b64, bld.def(s1, scc), src, lane), dst);
5529       } else if (src.regClass() == s1) {
5530          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), src);
5531       } else if (src.regClass() == s2) {
5532          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
5533       } else {
5534          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5535          nir_print_instr(&instr->instr, stderr);
5536          fprintf(stderr, "\n");
5537       }
5538       break;
5539    }
5540    case nir_intrinsic_vote_all: {
5541       Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
5542       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5543       assert(src.regClass() == s2);
5544       assert(dst.regClass() == s1);
5545
5546       Definition tmp = bld.def(s1);
5547       bld.sopc(aco_opcode::s_cmp_eq_u64, bld.scc(tmp),
5548                bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2)),
5549                Operand(exec, s2));
5550       emit_wqm(ctx, tmp.getTemp(), dst);
5551       break;
5552    }
5553    case nir_intrinsic_vote_any: {
5554       Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
5555       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5556       assert(src.regClass() == s2);
5557       assert(dst.regClass() == s1);
5558
5559       Definition tmp = bld.def(s1);
5560       bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.scc(tmp), src, Operand(exec, s2));
5561       emit_wqm(ctx, tmp.getTemp(), dst);
5562       break;
5563    }
5564    case nir_intrinsic_reduce:
5565    case nir_intrinsic_inclusive_scan:
5566    case nir_intrinsic_exclusive_scan: {
5567       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5568       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5569       nir_op op = (nir_op) nir_intrinsic_reduction_op(instr);
5570       unsigned cluster_size = instr->intrinsic == nir_intrinsic_reduce ?
5571          nir_intrinsic_cluster_size(instr) : 0;
5572       cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : 64, 64));
5573
5574       if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
5575          emit_uniform_subgroup(ctx, instr, src);
5576       } else if (instr->dest.ssa.bit_size == 1) {
5577          if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
5578             op = nir_op_iand;
5579          else if (op == nir_op_iadd)
5580             op = nir_op_ixor;
5581          else if (op == nir_op_umax || op == nir_op_imax)
5582             op = nir_op_ior;
5583          assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor);
5584
5585          switch (instr->intrinsic) {
5586          case nir_intrinsic_reduce:
5587             emit_wqm(ctx, emit_boolean_reduce(ctx, op, cluster_size, src), dst);
5588             break;
5589          case nir_intrinsic_exclusive_scan:
5590             emit_wqm(ctx, emit_boolean_exclusive_scan(ctx, op, src), dst);
5591             break;
5592          case nir_intrinsic_inclusive_scan:
5593             emit_wqm(ctx, emit_boolean_inclusive_scan(ctx, op, src), dst);
5594             break;
5595          default:
5596             assert(false);
5597          }
5598       } else if (cluster_size == 1) {
5599          bld.copy(Definition(dst), src);
5600       } else {
5601          src = as_vgpr(ctx, src);
5602
5603          ReduceOp reduce_op;
5604          switch (op) {
5605          #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
5606             CASE(iadd)
5607             CASE(imul)
5608             CASE(fadd)
5609             CASE(fmul)
5610             CASE(imin)
5611             CASE(umin)
5612             CASE(fmin)
5613             CASE(imax)
5614             CASE(umax)
5615             CASE(fmax)
5616             CASE(iand)
5617             CASE(ior)
5618             CASE(ixor)
5619             default:
5620                unreachable("unknown reduction op");
5621          #undef CASE
5622          }
5623
5624          aco_opcode aco_op;
5625          switch (instr->intrinsic) {
5626             case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break;
5627             case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break;
5628             case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break;
5629             default:
5630                unreachable("unknown reduce intrinsic");
5631          }
5632
5633          aco_ptr<Pseudo_reduction_instruction> reduce{create_instruction<Pseudo_reduction_instruction>(aco_op, Format::PSEUDO_REDUCTION, 3, 5)};
5634          reduce->operands[0] = Operand(src);
5635          // filled in by aco_reduce_assign.cpp, used internally as part of the
5636          // reduce sequence
5637          assert(dst.size() == 1 || dst.size() == 2);
5638          reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear());
5639          reduce->operands[2] = Operand(v1.as_linear());
5640
5641          Temp tmp_dst = bld.tmp(dst.regClass());
5642          reduce->definitions[0] = Definition(tmp_dst);
5643          reduce->definitions[1] = bld.def(s2); // used internally
5644          reduce->definitions[2] = Definition();
5645          reduce->definitions[3] = Definition(scc, s1);
5646          reduce->definitions[4] = Definition();
5647          reduce->reduce_op = reduce_op;
5648          reduce->cluster_size = cluster_size;
5649          ctx->block->instructions.emplace_back(std::move(reduce));
5650
5651          emit_wqm(ctx, tmp_dst, dst);
5652       }
5653       break;
5654    }
5655    case nir_intrinsic_quad_broadcast: {
5656       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5657       if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5658          emit_uniform_subgroup(ctx, instr, src);
5659       } else {
5660          Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5661          unsigned lane = nir_src_as_const_value(instr->src[1])->u32;
5662          if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5663             uint32_t half_mask = 0x11111111u << lane;
5664             Temp mask_tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand(half_mask), Operand(half_mask));
5665             Temp tmp = bld.tmp(s2);
5666             bld.sop1(aco_opcode::s_wqm_b64, Definition(tmp),
5667                      bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), mask_tmp,
5668                               bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2))));
5669             emit_wqm(ctx, tmp, dst);
5670          } else if (instr->dest.ssa.bit_size == 32) {
5671             emit_wqm(ctx,
5672                      bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src,
5673                                   dpp_quad_perm(lane, lane, lane, lane)),
5674                      dst);
5675          } else if (instr->dest.ssa.bit_size == 64) {
5676             Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5677             bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5678             lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_quad_perm(lane, lane, lane, lane)));
5679             hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_quad_perm(lane, lane, lane, lane)));
5680             bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5681             emit_split_vector(ctx, dst, 2);
5682          } else {
5683             fprintf(stderr, "Unimplemented NIR instr bit size: ");
5684             nir_print_instr(&instr->instr, stderr);
5685             fprintf(stderr, "\n");
5686          }
5687       }
5688       break;
5689    }
5690    case nir_intrinsic_quad_swap_horizontal:
5691    case nir_intrinsic_quad_swap_vertical:
5692    case nir_intrinsic_quad_swap_diagonal:
5693    case nir_intrinsic_quad_swizzle_amd: {
5694       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5695       if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5696          emit_uniform_subgroup(ctx, instr, src);
5697          break;
5698       }
5699       uint16_t dpp_ctrl = 0;
5700       switch (instr->intrinsic) {
5701       case nir_intrinsic_quad_swap_horizontal:
5702          dpp_ctrl = dpp_quad_perm(1, 0, 3, 2);
5703          break;
5704       case nir_intrinsic_quad_swap_vertical:
5705          dpp_ctrl = dpp_quad_perm(2, 3, 0, 1);
5706          break;
5707       case nir_intrinsic_quad_swap_diagonal:
5708          dpp_ctrl = dpp_quad_perm(3, 2, 1, 0);
5709          break;
5710       case nir_intrinsic_quad_swizzle_amd: {
5711          dpp_ctrl = nir_intrinsic_swizzle_mask(instr);
5712          break;
5713       }
5714       default:
5715          break;
5716       }
5717
5718       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5719       if (instr->dest.ssa.bit_size == 1 && src.regClass() == s2) {
5720          src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
5721          src = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5722          Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(s2), Operand(0u), src);
5723          emit_wqm(ctx, tmp, dst);
5724       } else if (instr->dest.ssa.bit_size == 32) {
5725          Temp tmp = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
5726          emit_wqm(ctx, tmp, dst);
5727       } else if (instr->dest.ssa.bit_size == 64) {
5728          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5729          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5730          lo = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl));
5731          hi = emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl));
5732          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5733          emit_split_vector(ctx, dst, 2);
5734       } else {
5735          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5736          nir_print_instr(&instr->instr, stderr);
5737          fprintf(stderr, "\n");
5738       }
5739       break;
5740    }
5741    case nir_intrinsic_masked_swizzle_amd: {
5742       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5743       if (!ctx->divergent_vals[instr->dest.ssa.index]) {
5744          emit_uniform_subgroup(ctx, instr, src);
5745          break;
5746       }
5747       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5748       uint32_t mask = nir_intrinsic_swizzle_mask(instr);
5749       if (dst.regClass() == v1) {
5750          emit_wqm(ctx,
5751                   bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
5752                   dst);
5753       } else if (dst.regClass() == v2) {
5754          Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
5755          bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
5756          lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
5757          hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
5758          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5759          emit_split_vector(ctx, dst, 2);
5760       } else {
5761          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5762          nir_print_instr(&instr->instr, stderr);
5763          fprintf(stderr, "\n");
5764       }
5765       break;
5766    }
5767    case nir_intrinsic_write_invocation_amd: {
5768       Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
5769       Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa));
5770       Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa));
5771       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5772       if (dst.regClass() == v1) {
5773          /* src2 is ignored for writelane. RA assigns the same reg for dst */
5774          emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val, lane, src), dst);
5775       } else if (dst.regClass() == v2) {
5776          Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
5777          Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
5778          bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
5779          bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
5780          Temp lo = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_lo, lane, src_hi));
5781          Temp hi = emit_wqm(ctx, bld.vop3(aco_opcode::v_writelane_b32, bld.def(v1), val_hi, lane, src_hi));
5782          bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
5783          emit_split_vector(ctx, dst, 2);
5784       } else {
5785          fprintf(stderr, "Unimplemented NIR instr bit size: ");
5786          nir_print_instr(&instr->instr, stderr);
5787          fprintf(stderr, "\n");
5788       }
5789       break;
5790    }
5791    case nir_intrinsic_mbcnt_amd: {
5792       Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
5793       RegClass rc = RegClass(src.type(), 1);
5794       Temp mask_lo = bld.tmp(rc), mask_hi = bld.tmp(rc);
5795       bld.pseudo(aco_opcode::p_split_vector, Definition(mask_lo), Definition(mask_hi), src);
5796       Temp tmp = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, Operand(0u));
5797       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5798       Temp wqm_tmp = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), mask_hi, tmp);
5799       emit_wqm(ctx, wqm_tmp, dst);
5800       break;
5801    }
5802    case nir_intrinsic_load_helper_invocation: {
5803       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5804       bld.pseudo(aco_opcode::p_load_helper, Definition(dst));
5805       ctx->block->kind |= block_kind_needs_lowering;
5806       ctx->program->needs_exact = true;
5807       break;
5808    }
5809    case nir_intrinsic_is_helper_invocation: {
5810       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5811       bld.pseudo(aco_opcode::p_is_helper, Definition(dst));
5812       ctx->block->kind |= block_kind_needs_lowering;
5813       ctx->program->needs_exact = true;
5814       break;
5815    }
5816    case nir_intrinsic_demote:
5817       bld.pseudo(aco_opcode::p_demote_to_helper);
5818       ctx->block->kind |= block_kind_needs_lowering;
5819       ctx->program->needs_exact = true;
5820       break;
5821    case nir_intrinsic_demote_if: {
5822       Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc),
5823                            as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false),
5824                            Operand(exec, s2));
5825       bld.pseudo(aco_opcode::p_demote_to_helper, cond);
5826       ctx->block->kind |= block_kind_needs_lowering;
5827       ctx->program->needs_exact = true;
5828       break;
5829    }
5830    case nir_intrinsic_first_invocation: {
5831       emit_wqm(ctx, bld.sop1(aco_opcode::s_ff1_i32_b64, bld.def(s1), Operand(exec, s2)),
5832                get_ssa_temp(ctx, &instr->dest.ssa));
5833       break;
5834    }
5835    case nir_intrinsic_shader_clock:
5836       bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)));
5837       break;
5838    case nir_intrinsic_load_vertex_id_zero_base: {
5839       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5840       bld.copy(Definition(dst), ctx->vertex_id);
5841       break;
5842    }
5843    case nir_intrinsic_load_first_vertex: {
5844       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5845       bld.copy(Definition(dst), ctx->base_vertex);
5846       break;
5847    }
5848    case nir_intrinsic_load_base_instance: {
5849       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5850       bld.copy(Definition(dst), ctx->start_instance);
5851       break;
5852    }
5853    case nir_intrinsic_load_instance_id: {
5854       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5855       bld.copy(Definition(dst), ctx->instance_id);
5856       break;
5857    }
5858    case nir_intrinsic_load_draw_id: {
5859       Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
5860       bld.copy(Definition(dst), ctx->draw_id);
5861       break;
5862    }
5863    default:
5864       fprintf(stderr, "Unimplemented intrinsic instr: ");
5865       nir_print_instr(&instr->instr, stderr);
5866       fprintf(stderr, "\n");
5867       abort();
5868
5869       break;
5870    }
5871 }
5872
5873
5874 void tex_fetch_ptrs(isel_context *ctx, nir_tex_instr *instr,
5875                     Temp *res_ptr, Temp *samp_ptr, Temp *fmask_ptr,
5876                     enum glsl_base_type *stype)
5877 {
5878    nir_deref_instr *texture_deref_instr = NULL;
5879    nir_deref_instr *sampler_deref_instr = NULL;
5880    int plane = -1;
5881
5882    for (unsigned i = 0; i < instr->num_srcs; i++) {
5883       switch (instr->src[i].src_type) {
5884       case nir_tex_src_texture_deref:
5885          texture_deref_instr = nir_src_as_deref(instr->src[i].src);
5886          break;
5887       case nir_tex_src_sampler_deref:
5888          sampler_deref_instr = nir_src_as_deref(instr->src[i].src);
5889          break;
5890       case nir_tex_src_plane:
5891          plane = nir_src_as_int(instr->src[i].src);
5892          break;
5893       default:
5894          break;
5895       }
5896    }
5897
5898    *stype = glsl_get_sampler_result_type(texture_deref_instr->type);
5899
5900    if (!sampler_deref_instr)
5901       sampler_deref_instr = texture_deref_instr;
5902
5903    if (plane >= 0) {
5904       assert(instr->op != nir_texop_txf_ms &&
5905              instr->op != nir_texop_samples_identical);
5906       assert(instr->sampler_dim  != GLSL_SAMPLER_DIM_BUF);
5907       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false, false);
5908    } else if (instr->sampler_dim  == GLSL_SAMPLER_DIM_BUF) {
5909       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false, false);
5910    } else {
5911       *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false, false);
5912    }
5913    if (samp_ptr) {
5914       *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false, false);
5915       if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) {
5916          fprintf(stderr, "Unimplemented sampler descriptor: ");
5917          nir_print_instr(&instr->instr, stderr);
5918          fprintf(stderr, "\n");
5919          abort();
5920          // TODO: build samp_ptr = and(samp_ptr, res_ptr)
5921       }
5922    }
5923    if (fmask_ptr && (instr->op == nir_texop_txf_ms ||
5924                      instr->op == nir_texop_samples_identical))
5925       *fmask_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false, false);
5926 }
5927
5928 void build_cube_select(isel_context *ctx, Temp ma, Temp id, Temp deriv,
5929                        Temp *out_ma, Temp *out_sc, Temp *out_tc)
5930 {
5931    Builder bld(ctx->program, ctx->block);
5932
5933    Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1);
5934    Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1);
5935    Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1);
5936
5937    Operand neg_one(0xbf800000u);
5938    Operand one(0x3f800000u);
5939    Operand two(0x40000000u);
5940    Operand four(0x40800000u);
5941
5942    Temp is_ma_positive = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), Operand(0u), ma);
5943    Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, one, is_ma_positive);
5944    Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand(0u), sgn_ma);
5945
5946    Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(s2)), four, id);
5947    Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(s2), two, id);
5948    is_ma_y = bld.sop2(aco_opcode::s_andn2_b64, bld.hint_vcc(bld.def(s2)), is_ma_y, is_ma_z);
5949    Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), is_ma_z, is_ma_y);
5950
5951    // select sc
5952    Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
5953    Temp sgn = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1),
5954                        bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z),
5955                        one, is_ma_y);
5956    *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
5957
5958    // select tc
5959    tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
5960    sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
5961    *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);
5962
5963    // select ma
5964    tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
5965                   bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
5966                   deriv_z, is_ma_z);
5967    tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7fffffffu), tmp);
5968    *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
5969 }
5970
5971 void prepare_cube_coords(isel_context *ctx, Temp* coords, Temp* ddx, Temp* ddy, bool is_deriv, bool is_array)
5972 {
5973    Builder bld(ctx->program, ctx->block);
5974    Temp coord_args[4], ma, tc, sc, id;
5975    for (unsigned i = 0; i < (is_array ? 4 : 3); i++)
5976       coord_args[i] = emit_extract_vector(ctx, *coords, i, v1);
5977
5978    if (is_array) {
5979       coord_args[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_args[3]);
5980
5981       // see comment in ac_prepare_cube_coords()
5982       if (ctx->options->chip_class <= GFX8)
5983          coord_args[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand(0u), coord_args[3]);
5984    }
5985
5986    ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
5987
5988    aco_ptr<VOP3A_instruction> vop3a{create_instruction<VOP3A_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)};
5989    vop3a->operands[0] = Operand(ma);
5990    vop3a->abs[0] = true;
5991    Temp invma = bld.tmp(v1);
5992    vop3a->definitions[0] = Definition(invma);
5993    ctx->block->instructions.emplace_back(std::move(vop3a));
5994
5995    sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
5996    if (!is_deriv)
5997       sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
5998
5999    tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6000    if (!is_deriv)
6001       tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
6002
6003    id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coord_args[0], coord_args[1], coord_args[2]);
6004
6005    if (is_deriv) {
6006       sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma);
6007       tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma);
6008
6009       for (unsigned i = 0; i < 2; i++) {
6010          // see comment in ac_prepare_cube_coords()
6011          Temp deriv_ma;
6012          Temp deriv_sc, deriv_tc;
6013          build_cube_select(ctx, ma, id, i ? *ddy : *ddx,
6014                            &deriv_ma, &deriv_sc, &deriv_tc);
6015
6016          deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma);
6017
6018          Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6019                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma),
6020                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc));
6021          Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1),
6022                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma),
6023                                bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc));
6024          *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y);
6025       }
6026
6027       sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), sc);
6028       tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand(0x3fc00000u/*1.5*/), tc);
6029    }
6030
6031    if (is_array)
6032       id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coord_args[3], id, Operand(0x41000000u/*8.0*/));
6033    *coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3), sc, tc, id);
6034
6035 }
6036
6037 Temp apply_round_slice(isel_context *ctx, Temp coords, unsigned idx)
6038 {
6039    Temp coord_vec[3];
6040    for (unsigned i = 0; i < coords.size(); i++)
6041       coord_vec[i] = emit_extract_vector(ctx, coords, i, v1);
6042
6043    Builder bld(ctx->program, ctx->block);
6044    coord_vec[idx] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coord_vec[idx]);
6045
6046    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6047    for (unsigned i = 0; i < coords.size(); i++)
6048       vec->operands[i] = Operand(coord_vec[i]);
6049    Temp res = bld.tmp(RegType::vgpr, coords.size());
6050    vec->definitions[0] = Definition(res);
6051    ctx->block->instructions.emplace_back(std::move(vec));
6052    return res;
6053 }
6054
6055 void get_const_vec(nir_ssa_def *vec, nir_const_value *cv[4])
6056 {
6057    if (vec->parent_instr->type != nir_instr_type_alu)
6058       return;
6059    nir_alu_instr *vec_instr = nir_instr_as_alu(vec->parent_instr);
6060    if (vec_instr->op != nir_op_vec(vec->num_components))
6061       return;
6062
6063    for (unsigned i = 0; i < vec->num_components; i++) {
6064       cv[i] = vec_instr->src[i].swizzle[0] == 0 ?
6065               nir_src_as_const_value(vec_instr->src[i].src) : NULL;
6066    }
6067 }
6068
6069 void visit_tex(isel_context *ctx, nir_tex_instr *instr)
6070 {
6071    Builder bld(ctx->program, ctx->block);
6072    bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
6073         has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
6074    Temp resource, sampler, fmask_ptr, bias = Temp(), coords, compare = Temp(), sample_index = Temp(),
6075         lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), derivs = Temp();
6076    nir_const_value *sample_index_cv = NULL;
6077    nir_const_value *const_offset[4] = {NULL, NULL, NULL, NULL};
6078    enum glsl_base_type stype;
6079    tex_fetch_ptrs(ctx, instr, &resource, &sampler, &fmask_ptr, &stype);
6080
6081    bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 &&
6082                                   (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT);
6083    bool tg4_integer_cube_workaround = tg4_integer_workarounds &&
6084                                       instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
6085
6086    for (unsigned i = 0; i < instr->num_srcs; i++) {
6087       switch (instr->src[i].src_type) {
6088       case nir_tex_src_coord:
6089          coords = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[i].src.ssa));
6090          break;
6091       case nir_tex_src_bias:
6092          if (instr->op == nir_texop_txb) {
6093             bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
6094             has_bias = true;
6095          }
6096          break;
6097       case nir_tex_src_lod: {
6098          nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
6099
6100          if (val && val->f32 <= 0.0) {
6101             level_zero = true;
6102          } else {
6103             lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
6104             has_lod = true;
6105          }
6106          break;
6107       }
6108       case nir_tex_src_comparator:
6109          if (instr->is_shadow) {
6110             compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
6111             has_compare = true;
6112          }
6113          break;
6114       case nir_tex_src_offset:
6115          offset = get_ssa_temp(ctx, instr->src[i].src.ssa);
6116          get_const_vec(instr->src[i].src.ssa, const_offset);
6117          has_offset = true;
6118          break;
6119       case nir_tex_src_ddx:
6120          ddx = get_ssa_temp(ctx, instr->src[i].src.ssa);
6121          has_ddx = true;
6122          break;
6123       case nir_tex_src_ddy:
6124          ddy = get_ssa_temp(ctx, instr->src[i].src.ssa);
6125          has_ddy = true;
6126          break;
6127       case nir_tex_src_ms_index:
6128          sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa);
6129          sample_index_cv = nir_src_as_const_value(instr->src[i].src);
6130          has_sample_index = true;
6131          break;
6132       case nir_tex_src_texture_offset:
6133       case nir_tex_src_sampler_offset:
6134       default:
6135          break;
6136       }
6137    }
6138 // TODO: all other cases: structure taken from ac_nir_to_llvm.c
6139    if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
6140       return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa), true);
6141
6142    if (instr->op == nir_texop_texture_samples) {
6143       Temp dword3 = emit_extract_vector(ctx, resource, 3, s1);
6144
6145       Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
6146       Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
6147       Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
6148       Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
6149
6150       bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
6151                samples, Operand(1u), bld.scc(is_msaa));
6152       return;
6153    }
6154
6155    if (has_offset && instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms) {
6156       aco_ptr<Instruction> tmp_instr;
6157       Temp acc, pack = Temp();
6158
6159       uint32_t pack_const = 0;
6160       for (unsigned i = 0; i < offset.size(); i++) {
6161          if (!const_offset[i])
6162             continue;
6163          pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i);
6164       }
6165
6166       if (offset.type() == RegType::sgpr) {
6167          for (unsigned i = 0; i < offset.size(); i++) {
6168             if (const_offset[i])
6169                continue;
6170
6171             acc = emit_extract_vector(ctx, offset, i, s1);
6172             acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(0x3Fu));
6173
6174             if (i) {
6175                acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc, Operand(8u * i));
6176             }
6177
6178             if (pack == Temp()) {
6179                pack = acc;
6180             } else {
6181                pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
6182             }
6183          }
6184
6185          if (pack_const && pack != Temp())
6186             pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), Operand(pack_const), pack);
6187       } else {
6188          for (unsigned i = 0; i < offset.size(); i++) {
6189             if (const_offset[i])
6190                continue;
6191
6192             acc = emit_extract_vector(ctx, offset, i, v1);
6193             acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x3Fu), acc);
6194
6195             if (i) {
6196                acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(8u * i), acc);
6197             }
6198
6199             if (pack == Temp()) {
6200                pack = acc;
6201             } else {
6202                pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
6203             }
6204          }
6205
6206          if (pack_const && pack != Temp())
6207             pack = bld.sop2(aco_opcode::v_or_b32, bld.def(v1), Operand(pack_const), pack);
6208       }
6209       if (pack_const && pack == Temp())
6210          offset = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(pack_const));
6211       else if (pack == Temp())
6212          has_offset = false;
6213       else
6214          offset = pack;
6215    }
6216
6217    if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
6218       prepare_cube_coords(ctx, &coords, &ddx, &ddy, instr->op == nir_texop_txd, instr->is_array && instr->op != nir_texop_lod);
6219
6220    /* pack derivatives */
6221    if (has_ddx || has_ddy) {
6222       if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class >= GFX9) {
6223          derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(v4),
6224                              ddx, Operand(0u), ddy, Operand(0u));
6225       } else {
6226          derivs = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, ddx.size() + ddy.size()), ddx, ddy);
6227       }
6228       has_derivs = true;
6229    }
6230
6231    if (instr->coord_components > 1 &&
6232        instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6233        instr->is_array &&
6234        instr->op != nir_texop_txf)
6235       coords = apply_round_slice(ctx, coords, 1);
6236
6237    if (instr->coord_components > 2 &&
6238       (instr->sampler_dim == GLSL_SAMPLER_DIM_2D ||
6239        instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6240        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
6241        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6242        instr->is_array &&
6243        instr->op != nir_texop_txf && instr->op != nir_texop_txf_ms)
6244       coords = apply_round_slice(ctx, coords, 2);
6245
6246    if (ctx->options->chip_class >= GFX9 &&
6247        instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6248        instr->op != nir_texop_lod && instr->coord_components) {
6249       assert(coords.size() > 0 && coords.size() < 3);
6250
6251       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size() + 1, 1)};
6252       vec->operands[0] = Operand(emit_extract_vector(ctx, coords, 0, v1));
6253       vec->operands[1] = instr->op == nir_texop_txf ? Operand((uint32_t) 0) : Operand((uint32_t) 0x3f000000);
6254       if (coords.size() > 1)
6255          vec->operands[2] = Operand(emit_extract_vector(ctx, coords, 1, v1));
6256       coords = bld.tmp(RegType::vgpr, coords.size() + 1);
6257       vec->definitions[0] = Definition(coords);
6258       ctx->block->instructions.emplace_back(std::move(vec));
6259    }
6260
6261    bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array);
6262
6263    if (instr->op == nir_texop_samples_identical)
6264       resource = fmask_ptr;
6265
6266    else if ((instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
6267              instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
6268             instr->op != nir_texop_txs) {
6269       assert(has_sample_index);
6270       Operand op(sample_index);
6271       if (sample_index_cv)
6272          op = Operand(sample_index_cv->u32);
6273       sample_index = adjust_sample_index_using_fmask(ctx, da, coords, op, fmask_ptr);
6274    }
6275
6276    if (has_offset && (instr->op == nir_texop_txf || instr->op == nir_texop_txf_ms)) {
6277       Temp split_coords[coords.size()];
6278       emit_split_vector(ctx, coords, coords.size());
6279       for (unsigned i = 0; i < coords.size(); i++)
6280          split_coords[i] = emit_extract_vector(ctx, coords, i, v1);
6281
6282       unsigned i = 0;
6283       for (; i < std::min(offset.size(), instr->coord_components); i++) {
6284          Temp off = emit_extract_vector(ctx, offset, i, v1);
6285          split_coords[i] = bld.vadd32(bld.def(v1), split_coords[i], off);
6286       }
6287
6288       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)};
6289       for (unsigned i = 0; i < coords.size(); i++)
6290          vec->operands[i] = Operand(split_coords[i]);
6291       coords = bld.tmp(coords.regClass());
6292       vec->definitions[0] = Definition(coords);
6293       ctx->block->instructions.emplace_back(std::move(vec));
6294
6295       has_offset = false;
6296    }
6297
6298    /* Build tex instruction */
6299    unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa);
6300    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6301    Temp tmp_dst = dst;
6302
6303    /* gather4 selects the component by dmask and always returns vec4 */
6304    if (instr->op == nir_texop_tg4) {
6305       assert(instr->dest.ssa.num_components == 4);
6306       if (instr->is_shadow)
6307          dmask = 1;
6308       else
6309          dmask = 1 << instr->component;
6310       if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr)
6311          tmp_dst = bld.tmp(v4);
6312    } else if (instr->op == nir_texop_samples_identical) {
6313       tmp_dst = bld.tmp(v1);
6314    } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || dst.type() == RegType::sgpr) {
6315       tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask)));
6316    }
6317
6318    aco_ptr<MIMG_instruction> tex;
6319    if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) {
6320       if (!has_lod)
6321          lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6322
6323       bool div_by_6 = instr->op == nir_texop_txs &&
6324                       instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
6325                       instr->is_array &&
6326                       (dmask & (1 << 2));
6327       if (tmp_dst.id() == dst.id() && div_by_6)
6328          tmp_dst = bld.tmp(tmp_dst.regClass());
6329
6330       tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6331       tex->operands[0] = Operand(as_vgpr(ctx,lod));
6332       tex->operands[1] = Operand(resource);
6333       if (ctx->options->chip_class >= GFX9 &&
6334           instr->op == nir_texop_txs &&
6335           instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
6336           instr->is_array) {
6337          tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1);
6338       } else if (instr->op == nir_texop_query_levels) {
6339          tex->dmask = 1 << 3;
6340       } else {
6341          tex->dmask = dmask;
6342       }
6343       tex->da = da;
6344       tex->definitions[0] = Definition(tmp_dst);
6345       tex->can_reorder = true;
6346       ctx->block->instructions.emplace_back(std::move(tex));
6347
6348       if (div_by_6) {
6349          /* divide 3rd value by 6 by multiplying with magic number */
6350          emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6351          Temp c = bld.copy(bld.def(s1), Operand((uint32_t) 0x2AAAAAAB));
6352          Temp by_6 = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), emit_extract_vector(ctx, tmp_dst, 2, v1), c);
6353          assert(instr->dest.ssa.num_components == 3);
6354          Temp tmp = dst.type() == RegType::vgpr ? dst : bld.tmp(v3);
6355          tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6356                               emit_extract_vector(ctx, tmp_dst, 0, v1),
6357                               emit_extract_vector(ctx, tmp_dst, 1, v1),
6358                               by_6);
6359
6360       }
6361
6362       expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6363       return;
6364    }
6365
6366    Temp tg4_compare_cube_wa64 = Temp();
6367
6368    if (tg4_integer_workarounds) {
6369       tex.reset(create_instruction<MIMG_instruction>(aco_opcode::image_get_resinfo, Format::MIMG, 2, 1));
6370       tex->operands[0] = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
6371       tex->operands[1] = Operand(resource);
6372       tex->dmask = 0x3;
6373       tex->da = da;
6374       Temp size = bld.tmp(v2);
6375       tex->definitions[0] = Definition(size);
6376       tex->can_reorder = true;
6377       ctx->block->instructions.emplace_back(std::move(tex));
6378       emit_split_vector(ctx, size, size.size());
6379
6380       Temp half_texel[2];
6381       for (unsigned i = 0; i < 2; i++) {
6382          half_texel[i] = emit_extract_vector(ctx, size, i, v1);
6383          half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]);
6384          half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]);
6385          half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0xbf000000/*-0.5*/), half_texel[i]);
6386       }
6387
6388       Temp orig_coords[2] = {
6389          emit_extract_vector(ctx, coords, 0, v1),
6390          emit_extract_vector(ctx, coords, 1, v1)};
6391       Temp new_coords[2] = {
6392          bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[0], half_texel[0]),
6393          bld.vop2(aco_opcode::v_add_f32, bld.def(v1), orig_coords[1], half_texel[1])
6394       };
6395
6396       if (tg4_integer_cube_workaround) {
6397          // see comment in ac_nir_to_llvm.c's lower_gather4_integer()
6398          Temp desc[resource.size()];
6399          aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector,
6400                                                                            Format::PSEUDO, 1, resource.size())};
6401          split->operands[0] = Operand(resource);
6402          for (unsigned i = 0; i < resource.size(); i++) {
6403             desc[i] = bld.tmp(s1);
6404             split->definitions[i] = Definition(desc[i]);
6405          }
6406          ctx->block->instructions.emplace_back(std::move(split));
6407
6408          Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], Operand(20u | (6u << 16)));
6409          Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt,
6410                                          Operand((uint32_t)V_008F14_IMG_DATA_FORMAT_8_8_8_8));
6411
6412          Temp nfmt;
6413          if (stype == GLSL_TYPE_UINT) {
6414             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6415                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_USCALED),
6416                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_UINT),
6417                             bld.scc(compare_cube_wa));
6418          } else {
6419             nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1),
6420                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SSCALED),
6421                             Operand((uint32_t)V_008F14_IMG_NUM_FORMAT_SINT),
6422                             bld.scc(compare_cube_wa));
6423          }
6424          tg4_compare_cube_wa64 = as_divergent_bool(ctx, compare_cube_wa, true);
6425          nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, Operand(26u));
6426
6427          desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1],
6428                             Operand((uint32_t)C_008F14_NUM_FORMAT));
6429          desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt);
6430
6431          aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
6432                                                                          Format::PSEUDO, resource.size(), 1)};
6433          for (unsigned i = 0; i < resource.size(); i++)
6434             vec->operands[i] = Operand(desc[i]);
6435          resource = bld.tmp(resource.regClass());
6436          vec->definitions[0] = Definition(resource);
6437          ctx->block->instructions.emplace_back(std::move(vec));
6438
6439          new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6440                                   new_coords[0], orig_coords[0], tg4_compare_cube_wa64);
6441          new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
6442                                   new_coords[1], orig_coords[1], tg4_compare_cube_wa64);
6443       }
6444
6445       if (coords.size() == 3) {
6446          coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v3),
6447                              new_coords[0], new_coords[1],
6448                              emit_extract_vector(ctx, coords, 2, v1));
6449       } else {
6450          assert(coords.size() == 2);
6451          coords = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2),
6452                              new_coords[0], new_coords[1]);
6453       }
6454    }
6455
6456    if (!(has_ddx && has_ddy) && !has_lod && !level_zero &&
6457        instr->sampler_dim != GLSL_SAMPLER_DIM_MS &&
6458        instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS)
6459       coords = emit_wqm(ctx, coords, bld.tmp(coords.regClass()), true);
6460
6461    std::vector<Operand> args;
6462    if (has_offset)
6463       args.emplace_back(Operand(offset));
6464    if (has_bias)
6465       args.emplace_back(Operand(bias));
6466    if (has_compare)
6467       args.emplace_back(Operand(compare));
6468    if (has_derivs)
6469       args.emplace_back(Operand(derivs));
6470    args.emplace_back(Operand(coords));
6471    if (has_sample_index)
6472       args.emplace_back(Operand(sample_index));
6473    if (has_lod)
6474       args.emplace_back(lod);
6475
6476    Operand arg;
6477    if (args.size() > 1) {
6478       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
6479       unsigned size = 0;
6480       for (unsigned i = 0; i < args.size(); i++) {
6481          size += args[i].size();
6482          vec->operands[i] = args[i];
6483       }
6484       RegClass rc = RegClass(RegType::vgpr, size);
6485       Temp tmp = bld.tmp(rc);
6486       vec->definitions[0] = Definition(tmp);
6487       ctx->block->instructions.emplace_back(std::move(vec));
6488       arg = Operand(tmp);
6489    } else {
6490       assert(args[0].isTemp());
6491       arg = Operand(as_vgpr(ctx, args[0].getTemp()));
6492    }
6493
6494    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
6495       //FIXME: if (ctx->abi->gfx9_stride_size_workaround) return ac_build_buffer_load_format_gfx9_safe()
6496
6497       assert(coords.size() == 1);
6498       unsigned last_bit = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
6499       aco_opcode op;
6500       switch (last_bit) {
6501       case 1:
6502          op = aco_opcode::buffer_load_format_x; break;
6503       case 2:
6504          op = aco_opcode::buffer_load_format_xy; break;
6505       case 3:
6506          op = aco_opcode::buffer_load_format_xyz; break;
6507       case 4:
6508          op = aco_opcode::buffer_load_format_xyzw; break;
6509       default:
6510          unreachable("Tex instruction loads more than 4 components.");
6511       }
6512
6513       /* if the instruction return value matches exactly the nir dest ssa, we can use it directly */
6514       if (last_bit == instr->dest.ssa.num_components && dst.type() == RegType::vgpr)
6515          tmp_dst = dst;
6516       else
6517          tmp_dst = bld.tmp(RegType::vgpr, last_bit);
6518
6519       aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
6520       mubuf->operands[0] = Operand(coords);
6521       mubuf->operands[1] = Operand(resource);
6522       mubuf->operands[2] = Operand((uint32_t) 0);
6523       mubuf->definitions[0] = Definition(tmp_dst);
6524       mubuf->idxen = true;
6525       mubuf->can_reorder = true;
6526       ctx->block->instructions.emplace_back(std::move(mubuf));
6527
6528       expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
6529       return;
6530    }
6531
6532
6533    if (instr->op == nir_texop_txf ||
6534        instr->op == nir_texop_txf_ms ||
6535        instr->op == nir_texop_samples_identical) {
6536       aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? aco_opcode::image_load : aco_opcode::image_load_mip;
6537       tex.reset(create_instruction<MIMG_instruction>(op, Format::MIMG, 2, 1));
6538       tex->operands[0] = Operand(arg);
6539       tex->operands[1] = Operand(resource);
6540       tex->dmask = dmask;
6541       tex->unrm = true;
6542       tex->da = da;
6543       tex->definitions[0] = Definition(tmp_dst);
6544       tex->can_reorder = true;
6545       ctx->block->instructions.emplace_back(std::move(tex));
6546
6547       if (instr->op == nir_texop_samples_identical) {
6548          assert(dmask == 1 && dst.regClass() == v1);
6549          assert(dst.id() != tmp_dst.id());
6550
6551          Temp tmp = bld.tmp(s2);
6552          bld.vopc(aco_opcode::v_cmp_eq_u32, Definition(tmp), Operand(0u), tmp_dst).def(0).setHint(vcc);
6553          bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand((uint32_t)-1), tmp);
6554
6555       } else {
6556          expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask);
6557       }
6558       return;
6559    }
6560
6561    // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
6562    aco_opcode opcode = aco_opcode::image_sample;
6563    if (has_offset) { /* image_sample_*_o */
6564       if (has_compare) {
6565          opcode = aco_opcode::image_sample_c_o;
6566          if (has_derivs)
6567             opcode = aco_opcode::image_sample_c_d_o;
6568          if (has_bias)
6569             opcode = aco_opcode::image_sample_c_b_o;
6570          if (level_zero)
6571             opcode = aco_opcode::image_sample_c_lz_o;
6572          if (has_lod)
6573             opcode = aco_opcode::image_sample_c_l_o;
6574       } else {
6575          opcode = aco_opcode::image_sample_o;
6576          if (has_derivs)
6577             opcode = aco_opcode::image_sample_d_o;
6578          if (has_bias)
6579             opcode = aco_opcode::image_sample_b_o;
6580          if (level_zero)
6581             opcode = aco_opcode::image_sample_lz_o;
6582          if (has_lod)
6583             opcode = aco_opcode::image_sample_l_o;
6584       }
6585    } else { /* no offset */
6586       if (has_compare) {
6587          opcode = aco_opcode::image_sample_c;
6588          if (has_derivs)
6589             opcode = aco_opcode::image_sample_c_d;
6590          if (has_bias)
6591             opcode = aco_opcode::image_sample_c_b;
6592          if (level_zero)
6593             opcode = aco_opcode::image_sample_c_lz;
6594          if (has_lod)
6595             opcode = aco_opcode::image_sample_c_l;
6596       } else {
6597          opcode = aco_opcode::image_sample;
6598          if (has_derivs)
6599             opcode = aco_opcode::image_sample_d;
6600          if (has_bias)
6601             opcode = aco_opcode::image_sample_b;
6602          if (level_zero)
6603             opcode = aco_opcode::image_sample_lz;
6604          if (has_lod)
6605             opcode = aco_opcode::image_sample_l;
6606       }
6607    }
6608
6609    if (instr->op == nir_texop_tg4) {
6610       if (has_offset) {
6611          opcode = aco_opcode::image_gather4_lz_o;
6612          if (has_compare)
6613             opcode = aco_opcode::image_gather4_c_lz_o;
6614       } else {
6615          opcode = aco_opcode::image_gather4_lz;
6616          if (has_compare)
6617             opcode = aco_opcode::image_gather4_c_lz;
6618       }
6619    } else if (instr->op == nir_texop_lod) {
6620       opcode = aco_opcode::image_get_lod;
6621    }
6622
6623    tex.reset(create_instruction<MIMG_instruction>(opcode, Format::MIMG, 3, 1));
6624    tex->operands[0] = arg;
6625    tex->operands[1] = Operand(resource);
6626    tex->operands[2] = Operand(sampler);
6627    tex->dmask = dmask;
6628    tex->da = da;
6629    tex->definitions[0] = Definition(tmp_dst);
6630    tex->can_reorder = true;
6631    ctx->block->instructions.emplace_back(std::move(tex));
6632
6633    if (tg4_integer_cube_workaround) {
6634       assert(tmp_dst.id() != dst.id());
6635       assert(tmp_dst.size() == dst.size() && dst.size() == 4);
6636
6637       emit_split_vector(ctx, tmp_dst, tmp_dst.size());
6638       Temp val[4];
6639       for (unsigned i = 0; i < dst.size(); i++) {
6640          val[i] = emit_extract_vector(ctx, tmp_dst, i, v1);
6641          Temp cvt_val;
6642          if (stype == GLSL_TYPE_UINT)
6643             cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]);
6644          else
6645             cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]);
6646          val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, tg4_compare_cube_wa64);
6647       }
6648       Temp tmp = dst.regClass() == v4 ? dst : bld.tmp(v4);
6649       tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp),
6650                            val[0], val[1], val[2], val[3]);
6651    }
6652    unsigned mask = instr->op == nir_texop_tg4 ? 0xF : dmask;
6653    expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask);
6654
6655 }
6656
6657
6658 Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa)
6659 {
6660    Temp tmp = get_ssa_temp(ctx, ssa);
6661    if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
6662       return Operand(tmp.regClass());
6663    else
6664       return Operand(tmp);
6665 }
6666
6667 void visit_phi(isel_context *ctx, nir_phi_instr *instr)
6668 {
6669    aco_ptr<Pseudo_instruction> phi;
6670    unsigned num_src = exec_list_length(&instr->srcs);
6671    Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
6672
6673    aco_opcode opcode = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index] ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
6674
6675    std::map<unsigned, nir_ssa_def*> phi_src;
6676    bool all_undef = true;
6677    nir_foreach_phi_src(src, instr) {
6678       phi_src[src->pred->index] = src->src.ssa;
6679       if (src->src.ssa->parent_instr->type != nir_instr_type_ssa_undef)
6680          all_undef = false;
6681    }
6682    if (all_undef) {
6683       Builder bld(ctx->program, ctx->block);
6684       if (dst.regClass() == s1) {
6685          bld.sop1(aco_opcode::s_mov_b32, Definition(dst), Operand(0u));
6686       } else if (dst.regClass() == v1) {
6687          bld.vop1(aco_opcode::v_mov_b32, Definition(dst), Operand(0u));
6688       } else {
6689          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6690          for (unsigned i = 0; i < dst.size(); i++)
6691             vec->operands[i] = Operand(0u);
6692          vec->definitions[0] = Definition(dst);
6693          ctx->block->instructions.emplace_back(std::move(vec));
6694       }
6695       return;
6696    }
6697
6698    /* try to scalarize vector phis */
6699    if (dst.size() > 1) {
6700       // TODO: scalarize linear phis on divergent ifs
6701       bool can_scalarize = (opcode == aco_opcode::p_phi || !(ctx->block->kind & block_kind_merge));
6702       std::array<Temp, 4> new_vec;
6703       for (std::pair<const unsigned, nir_ssa_def*>& pair : phi_src) {
6704          Operand src = get_phi_operand(ctx, pair.second);
6705          if (src.isTemp() && ctx->allocated_vec.find(src.tempId()) == ctx->allocated_vec.end()) {
6706             can_scalarize = false;
6707             break;
6708          }
6709       }
6710       if (can_scalarize) {
6711          unsigned num_components = instr->dest.ssa.num_components;
6712          assert(dst.size() % num_components == 0);
6713          RegClass rc = RegClass(dst.type(), dst.size() / num_components);
6714
6715          aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
6716          for (unsigned k = 0; k < num_components; k++) {
6717             phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src, 1));
6718             std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin();
6719             for (unsigned i = 0; i < num_src; i++) {
6720                Operand src = get_phi_operand(ctx, it->second);
6721                phi->operands[i] = src.isTemp() ? Operand(ctx->allocated_vec[src.tempId()][k]) : Operand(rc);
6722                ++it;
6723             }
6724             Temp phi_dst = {ctx->program->allocateId(), rc};
6725             phi->definitions[0] = Definition(phi_dst);
6726             ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
6727             new_vec[k] = phi_dst;
6728             vec->operands[k] = Operand(phi_dst);
6729          }
6730          vec->definitions[0] = Definition(dst);
6731          ctx->block->instructions.emplace_back(std::move(vec));
6732          ctx->allocated_vec.emplace(dst.id(), new_vec);
6733          return;
6734       }
6735    }
6736
6737    unsigned extra_src = 0;
6738    if (opcode == aco_opcode::p_linear_phi && (ctx->block->kind & block_kind_loop_exit) &&
6739        ctx->program->blocks[ctx->block->index-2].kind & block_kind_continue_or_break) {
6740       extra_src++;
6741    }
6742
6743    phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_src + extra_src, 1));
6744
6745    /* if we have a linear phi on a divergent if, we know that one src is undef */
6746    if (opcode == aco_opcode::p_linear_phi && ctx->block->kind & block_kind_merge) {
6747       assert(extra_src == 0);
6748       Block* block;
6749       /* we place the phi either in the invert-block or in the current block */
6750       if (phi_src.begin()->second->parent_instr->type != nir_instr_type_ssa_undef) {
6751          assert((++phi_src.begin())->second->parent_instr->type == nir_instr_type_ssa_undef);
6752          Block& linear_else = ctx->program->blocks[ctx->block->linear_preds[1]];
6753          block = &ctx->program->blocks[linear_else.linear_preds[0]];
6754          assert(block->kind & block_kind_invert);
6755          phi->operands[0] = get_phi_operand(ctx, phi_src.begin()->second);
6756       } else {
6757          assert((++phi_src.begin())->second->parent_instr->type != nir_instr_type_ssa_undef);
6758          block = ctx->block;
6759          phi->operands[0] = get_phi_operand(ctx, (++phi_src.begin())->second);
6760       }
6761       phi->operands[1] = Operand(dst.regClass());
6762       phi->definitions[0] = Definition(dst);
6763       block->instructions.emplace(block->instructions.begin(), std::move(phi));
6764       return;
6765    }
6766
6767    std::map<unsigned, nir_ssa_def*>::iterator it = phi_src.begin();
6768    for (unsigned i = 0; i < num_src; i++) {
6769       phi->operands[i] = get_phi_operand(ctx, it->second);
6770       ++it;
6771    }
6772    for (unsigned i = 0; i < extra_src; i++)
6773       phi->operands[num_src + i] = Operand(dst.regClass());
6774    phi->definitions[0] = Definition(dst);
6775    ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi));
6776 }
6777
6778
6779 void visit_undef(isel_context *ctx, nir_ssa_undef_instr *instr)
6780 {
6781    Temp dst = get_ssa_temp(ctx, &instr->def);
6782
6783    assert(dst.type() == RegType::sgpr);
6784
6785    if (dst.size() == 1) {
6786       Builder(ctx->program, ctx->block).copy(Definition(dst), Operand(0u));
6787    } else {
6788       aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)};
6789       for (unsigned i = 0; i < dst.size(); i++)
6790          vec->operands[i] = Operand(0u);
6791       vec->definitions[0] = Definition(dst);
6792       ctx->block->instructions.emplace_back(std::move(vec));
6793    }
6794 }
6795
6796 void visit_jump(isel_context *ctx, nir_jump_instr *instr)
6797 {
6798    Builder bld(ctx->program, ctx->block);
6799    Block *logical_target;
6800    append_logical_end(ctx->block);
6801    unsigned idx = ctx->block->index;
6802
6803    switch (instr->type) {
6804    case nir_jump_break:
6805       logical_target = ctx->cf_info.parent_loop.exit;
6806       add_logical_edge(idx, logical_target);
6807       ctx->block->kind |= block_kind_break;
6808
6809       if (!ctx->cf_info.parent_if.is_divergent &&
6810           !ctx->cf_info.parent_loop.has_divergent_continue) {
6811          /* uniform break - directly jump out of the loop */
6812          ctx->block->kind |= block_kind_uniform;
6813          ctx->cf_info.has_branch = true;
6814          bld.branch(aco_opcode::p_branch);
6815          add_linear_edge(idx, logical_target);
6816          return;
6817       }
6818       ctx->cf_info.parent_loop.has_divergent_branch = true;
6819       break;
6820    case nir_jump_continue:
6821       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
6822       add_logical_edge(idx, logical_target);
6823       ctx->block->kind |= block_kind_continue;
6824
6825       if (ctx->cf_info.parent_if.is_divergent) {
6826          /* for potential uniform breaks after this continue,
6827             we must ensure that they are handled correctly */
6828          ctx->cf_info.parent_loop.has_divergent_continue = true;
6829          ctx->cf_info.parent_loop.has_divergent_branch = true;
6830       } else {
6831          /* uniform continue - directly jump to the loop header */
6832          ctx->block->kind |= block_kind_uniform;
6833          ctx->cf_info.has_branch = true;
6834          bld.branch(aco_opcode::p_branch);
6835          add_linear_edge(idx, logical_target);
6836          return;
6837       }
6838       break;
6839    default:
6840       fprintf(stderr, "Unknown NIR jump instr: ");
6841       nir_print_instr(&instr->instr, stderr);
6842       fprintf(stderr, "\n");
6843       abort();
6844    }
6845
6846    /* remove critical edges from linear CFG */
6847    bld.branch(aco_opcode::p_branch);
6848    Block* break_block = ctx->program->create_and_insert_block();
6849    break_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
6850    break_block->kind |= block_kind_uniform;
6851    add_linear_edge(idx, break_block);
6852    /* the loop_header pointer might be invalidated by this point */
6853    if (instr->type == nir_jump_continue)
6854       logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx];
6855    add_linear_edge(break_block->index, logical_target);
6856    bld.reset(break_block);
6857    bld.branch(aco_opcode::p_branch);
6858
6859    Block* continue_block = ctx->program->create_and_insert_block();
6860    continue_block->loop_nest_depth = ctx->cf_info.loop_nest_depth;
6861    add_linear_edge(idx, continue_block);
6862    append_logical_start(continue_block);
6863    ctx->block = continue_block;
6864    return;
6865 }
6866
6867 void visit_block(isel_context *ctx, nir_block *block)
6868 {
6869    nir_foreach_instr(instr, block) {
6870       switch (instr->type) {
6871       case nir_instr_type_alu:
6872          visit_alu_instr(ctx, nir_instr_as_alu(instr));
6873          break;
6874       case nir_instr_type_load_const:
6875          visit_load_const(ctx, nir_instr_as_load_const(instr));
6876          break;
6877       case nir_instr_type_intrinsic:
6878          visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
6879          break;
6880       case nir_instr_type_tex:
6881          visit_tex(ctx, nir_instr_as_tex(instr));
6882          break;
6883       case nir_instr_type_phi:
6884          visit_phi(ctx, nir_instr_as_phi(instr));
6885          break;
6886       case nir_instr_type_ssa_undef:
6887          visit_undef(ctx, nir_instr_as_ssa_undef(instr));
6888          break;
6889       case nir_instr_type_deref:
6890          break;
6891       case nir_instr_type_jump:
6892          visit_jump(ctx, nir_instr_as_jump(instr));
6893          break;
6894       default:
6895          fprintf(stderr, "Unknown NIR instr type: ");
6896          nir_print_instr(instr, stderr);
6897          fprintf(stderr, "\n");
6898          //abort();
6899       }
6900    }
6901 }
6902
6903
6904
/*
 * Emit the control flow for a NIR loop.
 *
 * The current block becomes the loop preheader and branches into a newly
 * created loop header block. The loop body is then visited; if the body did
 * not already end in a branch, a back-edge to the loop header is emitted.
 * Afterwards the loop exit block is inserted into the program and becomes the
 * current block.
 *
 * ctx:  instruction selection context; ctx->block and ctx->cf_info are updated
 * loop: the NIR loop whose body is translated
 */
6905 static void visit_loop(isel_context *ctx, nir_loop *loop)
6906 {
6907    append_logical_end(ctx->block);
6908    ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
6909    Builder bld(ctx->program, ctx->block);
6910    bld.branch(aco_opcode::p_branch);
6911    unsigned loop_preheader_idx = ctx->block->index;
6912
        /* The exit block is only prepared here; it is inserted into the program
         * after the loop body has been emitted (see insert_block() below). It
         * inherits the top-level flag of the preheader. */
6913    Block loop_exit = Block();
6914    loop_exit.loop_nest_depth = ctx->cf_info.loop_nest_depth;
6915    loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level));
6916
6917    Block* loop_header = ctx->program->create_and_insert_block();
6918    loop_header->loop_nest_depth = ctx->cf_info.loop_nest_depth + 1;
6919    loop_header->kind |= block_kind_loop_header;
6920    add_edge(loop_preheader_idx, loop_header);
6921    ctx->block = loop_header;
6922
6923    /* emit loop body */
        /* The header is remembered by index rather than by pointer; later code
         * re-fetches it via ctx->program->blocks[loop_header_idx].
         * NOTE(review): loop_info_RAII saves the previous parent_loop state in its
         * constructor; presumably it installs the new header/exit and restores the
         * old state on scope exit - confirm against the class definition. */
6924    unsigned loop_header_idx = loop_header->index;
6925    loop_info_RAII loop_raii(ctx, loop_header_idx, &loop_exit);
6926    append_logical_start(ctx->block);
6927    visit_cf_list(ctx, &loop->body);
6928
6929    //TODO: what if a loop ends with an unconditional or uniformly branched continue and this branch is never taken?
6930    if (!ctx->cf_info.has_branch) {
6931       append_logical_end(ctx->block);
6932       if (ctx->cf_info.exec_potentially_empty) {
6933          /* Discards can result in code running with an empty exec mask.
6934           * This would result in divergent breaks not ever being taken. As a
6935           * workaround, break the loop when the loop mask is empty instead of
6936           * always continuing. */
6937          ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform);
6938
6939          /* create "loop_almost_exit" to avoid critical edges */
6940          unsigned block_idx = ctx->block->index;
6941          Block *loop_almost_exit = ctx->program->create_and_insert_block();
6942          loop_almost_exit->loop_nest_depth = ctx->cf_info.loop_nest_depth;
6943          loop_almost_exit->kind = block_kind_uniform;
6944          bld.reset(loop_almost_exit);
6945          bld.branch(aco_opcode::p_branch);
6946
6947          add_linear_edge(block_idx, loop_almost_exit);
6948          add_linear_edge(loop_almost_exit->index, &loop_exit);
6949
              /* Re-fetch the last body block by index before continuing.
               * NOTE(review): presumably create_and_insert_block() can invalidate
               * previously obtained Block pointers - confirm. */
6950          ctx->block = &ctx->program->blocks[block_idx];
6951       } else {
6952          ctx->block->kind |= (block_kind_continue | block_kind_uniform);
6953       }
           /* Back-edge to the loop header: add_edge() is used when the body had no
            * divergent branch, otherwise only a linear edge is added. */
6954       if (!ctx->cf_info.parent_loop.has_divergent_branch)
6955          add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
6956       else
6957          add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]);
6958       bld.reset(ctx->block);
6959       bld.branch(aco_opcode::p_branch);
6960    }
6961
6962    /* fixup phis in loop header from unreachable blocks */
6963    if (ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch) {
           /* linear phis are only trimmed when the body ended in a branch;
            * `logical` repeats the enclosing condition and is therefore always
            * true here */
6964       bool linear = ctx->cf_info.has_branch;
6965       bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch;
6966       for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) {
6967          if ((logical && instr->opcode == aco_opcode::p_phi) ||
6968              (linear && instr->opcode == aco_opcode::p_linear_phi)) {
6969             /* the last operand should be the one that needs to be removed */
6970             instr->operands.pop_back();
6971          } else if (!is_phi(instr)) {
                 /* stop at the first non-phi instruction of the header */
6972             break;
6973          }
6974       }
6975    }
6976
6977    ctx->cf_info.has_branch = false;
6978
6979    // TODO: if the loop has no exit at all, we must add one
6980    /* emit loop successor block */
6981    ctx->block = ctx->program->insert_block(std::move(loop_exit));
6982    append_logical_start(ctx->block);
6983
        /* Disabled code. NOTE(review): it refers to `loop_entry`, which is not
         * declared in this function, so it would not compile if enabled as is. */
6984    #if 0
6985    // TODO: check if it is beneficial to not branch on continues
6986    /* trim linear phis in loop header */
6987    for (auto&& instr : loop_entry->instructions) {
6988       if (instr->opcode == aco_opcode::p_linear_phi) {
6989          aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)};
6990          new_phi->definitions[0] = instr->definitions[0];
6991          for (unsigned i = 0; i < new_phi->operands.size(); i++)
6992             new_phi->operands[i] = instr->operands[i];
6993          /* check that the remaining operands are all the same */
6994          for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++)
6995             assert(instr->operands[i].tempId() == instr->operands.back().tempId());
6996          instr.swap(new_phi);
6997       } else if (instr->opcode == aco_opcode::p_phi) {
6998          continue;
6999       } else {
7000          break;
7001       }
7002    }
7003    #endif
7004 }
7005
/*
 * Open the "then" side of a divergent (non-uniform) if.
 *
 * Ends the current block with a p_cbranch_z on `cond`, prepares the invert and
 * endif blocks inside `ic` (they are inserted later by
 * begin_divergent_if_else() and end_divergent_if()), saves the control flow
 * state that is overwritten here, and makes a new logical then block the
 * current block.
 *
 * ctx:  instruction selection context
 * ic:   per-if state shared with begin_divergent_if_else()/end_divergent_if()
 * cond: branch condition; asserted to have register class s2
 */
7006 static void begin_divergent_if_then(isel_context *ctx, if_context *ic, Temp cond)
7007 {
7008    ic->cond = cond;
7009
7010    append_logical_end(ctx->block);
7011    ctx->block->kind |= block_kind_branch;
7012
7013    /* branch to linear then block */
7014    assert(cond.regClass() == s2);
7015    aco_ptr<Pseudo_branch_instruction> branch;
7016    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7017    branch->operands[0] = Operand(cond);
7018    ctx->block->instructions.push_back(std::move(branch));
7019
        /* the if block is remembered by index; the invert/endif blocks live in
         * `ic` until they are inserted into the program */
7020    ic->BB_if_idx = ctx->block->index;
7021    ic->BB_invert = Block();
7022    ic->BB_invert.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7023    /* Invert blocks are intentionally not marked as top level because they
7024     * are not part of the logical cfg. */
7025    ic->BB_invert.kind |= block_kind_invert;
7026    ic->BB_endif = Block();
7027    ic->BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7028    ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level));
7029
        /* save the state that end_divergent_if() restores/merges */
7030    ic->exec_potentially_empty_old = ctx->cf_info.exec_potentially_empty;
7031    ic->divergent_old = ctx->cf_info.parent_if.is_divergent;
7032    ctx->cf_info.parent_if.is_divergent = true;
7033    ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7034
7035    /** emit logical then block */
7036    Block* BB_then_logical = ctx->program->create_and_insert_block();
7037    BB_then_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7038    add_edge(ic->BB_if_idx, BB_then_logical);
7039    ctx->block = BB_then_logical;
7040    append_logical_start(BB_then_logical);
7041 }
7042
/*
 * Close the "then" side of a divergent if and open its "else" side.
 *
 * Terminates the logical then block, emits the linear then block and the
 * invert block (which ends in a p_cbranch_nz on ic->cond), and makes a new
 * logical else block the current block.
 *
 * ctx: instruction selection context; ctx->block must be the last block of the
 *      then side (has_branch is asserted to be false)
 * ic:  state filled in by begin_divergent_if_then()
 */
7043 static void begin_divergent_if_else(isel_context *ctx, if_context *ic)
7044 {
7045    Block *BB_then_logical = ctx->block;
7046    append_logical_end(BB_then_logical);
7047     /* branch from logical then block to invert block */
7048    aco_ptr<Pseudo_branch_instruction> branch;
7049    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7050    BB_then_logical->instructions.emplace_back(std::move(branch));
7051    add_linear_edge(BB_then_logical->index, &ic->BB_invert);
        /* the logical edge to the endif block is omitted if the then side
         * contained a divergent break/continue */
7052    if (!ctx->cf_info.parent_loop.has_divergent_branch)
7053       add_logical_edge(BB_then_logical->index, &ic->BB_endif);
7054    BB_then_logical->kind |= block_kind_uniform;
7055    assert(!ctx->cf_info.has_branch);
        /* remember the then side's flag and start the else side with a clean one;
         * end_divergent_if() combines both */
7056    ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7057    ctx->cf_info.parent_loop.has_divergent_branch = false;
7058
7059    /** emit linear then block */
7060    Block* BB_then_linear = ctx->program->create_and_insert_block();
7061    BB_then_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7062    BB_then_linear->kind |= block_kind_uniform;
7063    add_linear_edge(ic->BB_if_idx, BB_then_linear);
7064    /* branch from linear then block to invert block */
7065    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7066    BB_then_linear->instructions.emplace_back(std::move(branch));
7067    add_linear_edge(BB_then_linear->index, &ic->BB_invert);
7068
7069    /** emit invert merge block */
7070    ctx->block = ctx->program->insert_block(std::move(ic->BB_invert));
7071    ic->invert_idx = ctx->block->index;
7072
7073    /* branch to linear else block (skip else) */
7074    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_nz, Format::PSEUDO_BRANCH, 1, 0));
7075    branch->operands[0] = Operand(ic->cond);
7076    ctx->block->instructions.push_back(std::move(branch));
7077
        /* accumulate what the then side reported before resetting for the else side */
7078    ic->exec_potentially_empty_old |= ctx->cf_info.exec_potentially_empty;
7079    ctx->cf_info.exec_potentially_empty = false; /* divergent branches use cbranch_execz */
7080
7081    /** emit logical else block */
        /* logical predecessor is the if block, linear predecessor is the invert block */
7082    Block* BB_else_logical = ctx->program->create_and_insert_block();
7083    BB_else_logical->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7084    add_logical_edge(ic->BB_if_idx, BB_else_logical);
7085    add_linear_edge(ic->invert_idx, BB_else_logical);
7086    ctx->block = BB_else_logical;
7087    append_logical_start(BB_else_logical);
7088 }
7089
/*
 * Close the "else" side of a divergent if and emit the endif merge block.
 *
 * Terminates the logical else block, emits the linear else block, inserts
 * ic->BB_endif as the new current block and restores/merges the control flow
 * state saved in `ic`.
 *
 * ctx: instruction selection context; ctx->block must be the last block of the
 *      else side (has_branch is asserted to be false)
 * ic:  state filled in by begin_divergent_if_then()/begin_divergent_if_else()
 */
7090 static void end_divergent_if(isel_context *ctx, if_context *ic)
7091 {
7092    Block *BB_else_logical = ctx->block;
7093    append_logical_end(BB_else_logical);
7094
7095    /* branch from logical else block to endif block */
7096    aco_ptr<Pseudo_branch_instruction> branch;
7097    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7098    BB_else_logical->instructions.emplace_back(std::move(branch));
7099    add_linear_edge(BB_else_logical->index, &ic->BB_endif);
        /* the logical edge is omitted if the else side contained a divergent
         * break/continue */
7100    if (!ctx->cf_info.parent_loop.has_divergent_branch)
7101       add_logical_edge(BB_else_logical->index, &ic->BB_endif);
7102    BB_else_logical->kind |= block_kind_uniform;
7103
7104    assert(!ctx->cf_info.has_branch);
        /* the flag stays set only if both the then and the else side set it */
7105    ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
7106
7107
7108    /** emit linear else block */
7109    Block* BB_else_linear = ctx->program->create_and_insert_block();
7110    BB_else_linear->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7111    BB_else_linear->kind |= block_kind_uniform;
7112    add_linear_edge(ic->invert_idx, BB_else_linear);
7113
7114    /* branch from linear else block to endif block */
7115    branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7116    BB_else_linear->instructions.emplace_back(std::move(branch));
7117    add_linear_edge(BB_else_linear->index, &ic->BB_endif);
7118
7119
7120    /** emit endif merge block */
7121    ctx->block = ctx->program->insert_block(std::move(ic->BB_endif));
7122    append_logical_start(ctx->block);
7123
7124
        /* restore the enclosing if's divergence and merge in the saved
         * exec_potentially_empty state */
7125    ctx->cf_info.parent_if.is_divergent = ic->divergent_old;
7126    ctx->cf_info.exec_potentially_empty |= ic->exec_potentially_empty_old;
7127    /* uniform control flow never has an empty exec-mask */
7128    if (!ctx->cf_info.loop_nest_depth && !ctx->cf_info.parent_if.is_divergent)
7129       ctx->cf_info.exec_potentially_empty = false;
7130 }
7131
/*
 * Emit the control flow for a NIR if statement.
 *
 * If the condition is not marked divergent in ctx->divergent_vals, a uniform
 * branch is emitted whose condition operand is fixed to scc, and the then/else
 * lists are visited in separate blocks that both lead to a common endif block.
 * Otherwise the divergent helpers begin_divergent_if_then(),
 * begin_divergent_if_else() and end_divergent_if() build the CFG.
 *
 * ctx:     instruction selection context; ctx->block and ctx->cf_info are updated
 * if_stmt: the NIR if whose then/else lists are translated
 */
7132 static void visit_if(isel_context *ctx, nir_if *if_stmt)
7133 {
7134    Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa);
        /* NOTE(review): `bld` is not used anywhere in this function */
7135    Builder bld(ctx->program, ctx->block);
7136    aco_ptr<Pseudo_branch_instruction> branch;
7137
7138    if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
7139       /**
7140        * Uniform conditionals are represented in the following way*) :
7141        *
7142        * The linear and logical CFG:
7143        *                        BB_IF
7144        *                        /    \
7145        *       BB_THEN (logical)      BB_ELSE (logical)
7146        *                        \    /
7147        *                        BB_ENDIF
7148        *
7149        * *) Exceptions may be due to break and continue statements within loops
7150        *    If a break/continue happens within uniform control flow, it branches
7151        *    to the loop exit/entry block. Otherwise, it branches to the next
7152        *    merge block.
7153        **/
7154       append_logical_end(ctx->block);
7155       ctx->block->kind |= block_kind_uniform;
7156
7157       /* emit branch */
7158       if (cond.regClass() == s2) {
7159          // TODO: in a post-RA optimizer, we could check if the condition is in VCC and omit this instruction
7160          cond = as_uniform_bool(ctx, cond);
7161       }
7162       branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, Format::PSEUDO_BRANCH, 1, 0));
7163       branch->operands[0] = Operand(cond);
7164       branch->operands[0].setFixed(scc);
7165       ctx->block->instructions.emplace_back(std::move(branch));
7166
           /* the endif block is only prepared here; it is inserted at the end, and
            * only if at least one side falls through to it */
7167       unsigned BB_if_idx = ctx->block->index;
7168       Block BB_endif = Block();
7169       BB_endif.loop_nest_depth = ctx->cf_info.loop_nest_depth;
7170       BB_endif.kind |= ctx->block->kind & block_kind_top_level;
7171
7172       /** emit then block */
7173       Block* BB_then = ctx->program->create_and_insert_block();
7174       BB_then->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7175       add_edge(BB_if_idx, BB_then);
7176       append_logical_start(BB_then);
7177       ctx->block = BB_then;
7178       visit_cf_list(ctx, &if_stmt->then_list);
           /* visiting the list may have changed the current block; continue with
            * the last block of the then side */
7179       BB_then = ctx->block;
7180       bool then_branch = ctx->cf_info.has_branch;
7181       bool then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;
7182
7183       if (!then_branch) {
7184          append_logical_end(BB_then);
7185          /* branch from then block to endif block */
7186          branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7187          BB_then->instructions.emplace_back(std::move(branch));
7188          add_linear_edge(BB_then->index, &BB_endif);
7189          if (!then_branch_divergent)
7190             add_logical_edge(BB_then->index, &BB_endif);
7191          BB_then->kind |= block_kind_uniform;
7192       }
7193
           /* reset the flags so that the else side is tracked independently */
7194       ctx->cf_info.has_branch = false;
7195       ctx->cf_info.parent_loop.has_divergent_branch = false;
7196
7197       /** emit else block */
7198       Block* BB_else = ctx->program->create_and_insert_block();
7199       BB_else->loop_nest_depth = ctx->cf_info.loop_nest_depth;
7200       add_edge(BB_if_idx, BB_else);
7201       append_logical_start(BB_else);
7202       ctx->block = BB_else;
7203       visit_cf_list(ctx, &if_stmt->else_list);
7204       BB_else = ctx->block;
7205
7206       if (!ctx->cf_info.has_branch) {
7207          append_logical_end(BB_else);
7208          /* branch from else block to endif block */
7209          branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 0));
7210          BB_else->instructions.emplace_back(std::move(branch));
7211          add_linear_edge(BB_else->index, &BB_endif);
7212          if (!ctx->cf_info.parent_loop.has_divergent_branch)
7213             add_logical_edge(BB_else->index, &BB_endif);
7214          BB_else->kind |= block_kind_uniform;
7215       }
7216
           /* both flags stay set only if the then side AND the else side set them */
7217       ctx->cf_info.has_branch &= then_branch;
7218       ctx->cf_info.parent_loop.has_divergent_branch &= then_branch_divergent;
7219
7220       /** emit endif merge block */
           /* skipped when both sides ended in a branch: BB_endif is then never
            * inserted */
7221       if (!ctx->cf_info.has_branch) {
7222          ctx->block = ctx->program->insert_block(std::move(BB_endif));
7223          append_logical_start(ctx->block);
7224       }
7225    } else { /* non-uniform condition */
7226       /**
7227        * To maintain a logical and linear CFG without critical edges,
7228        * non-uniform conditionals are represented in the following way*) :
7229        *
7230        * The linear CFG:
7231        *                        BB_IF
7232        *                        /    \
7233        *       BB_THEN (logical)      BB_THEN (linear)
7234        *                        \    /
7235        *                        BB_INVERT (linear)
7236        *                        /    \
7237        *       BB_ELSE (logical)      BB_ELSE (linear)
7238        *                        \    /
7239        *                        BB_ENDIF
7240        *
7241        * The logical CFG:
7242        *                        BB_IF
7243        *                        /    \
7244        *       BB_THEN (logical)      BB_ELSE (logical)
7245        *                        \    /
7246        *                        BB_ENDIF
7247        *
7248        * *) Exceptions may be due to break and continue statements within loops
7249        **/
7250
7251       if_context ic;
7252
7253       begin_divergent_if_then(ctx, &ic, cond);
7254       visit_cf_list(ctx, &if_stmt->then_list);
7255
7256       begin_divergent_if_else(ctx, &ic);
7257       visit_cf_list(ctx, &if_stmt->else_list);
7258
7259       end_divergent_if(ctx, &ic);
7260    }
7261 }
7262
7263 static void visit_cf_list(isel_context *ctx,
7264                           struct exec_list *list)
7265 {
7266    foreach_list_typed(nir_cf_node, node, node, list) {
7267       switch (node->type) {
7268       case nir_cf_node_block:
7269          visit_block(ctx, nir_cf_node_as_block(node));
7270          break;
7271       case nir_cf_node_if:
7272          visit_if(ctx, nir_cf_node_as_if(node));
7273          break;
7274       case nir_cf_node_loop:
7275          visit_loop(ctx, nir_cf_node_as_loop(node));
7276          break;
7277       default:
7278          unreachable("unimplemented cf list type");
7279       }
7280    }
7281 }
7282
7283 static void export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *next_pos)
7284 {
7285    int offset = ctx->program->info->vs.outinfo.vs_output_param_offset[slot];
7286    uint64_t mask = ctx->vs_output.mask[slot];
7287    if (!is_pos && !mask)
7288       return;
7289    if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED)
7290       return;
7291    aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7292    exp->enabled_mask = mask;
7293    for (unsigned i = 0; i < 4; ++i) {
7294       if (mask & (1 << i))
7295          exp->operands[i] = Operand(ctx->vs_output.outputs[slot][i]);
7296       else
7297          exp->operands[i] = Operand(v1);
7298    }
7299    exp->valid_mask = false;
7300    exp->done = false;
7301    exp->compressed = false;
7302    if (is_pos)
7303       exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7304    else
7305       exp->dest = V_008DFC_SQ_EXP_PARAM + offset;
7306    ctx->block->instructions.emplace_back(std::move(exp));
7307 }
7308
7309 static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
7310 {
7311    aco_ptr<Export_instruction> exp{create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)};
7312    exp->enabled_mask = 0;
7313    for (unsigned i = 0; i < 4; ++i)
7314       exp->operands[i] = Operand(v1);
7315    if (ctx->vs_output.mask[VARYING_SLOT_PSIZ]) {
7316       exp->operands[0] = Operand(ctx->vs_output.outputs[VARYING_SLOT_PSIZ][0]);
7317       exp->enabled_mask |= 0x1;
7318    }
7319    if (ctx->vs_output.mask[VARYING_SLOT_LAYER]) {
7320       exp->operands[2] = Operand(ctx->vs_output.outputs[VARYING_SLOT_LAYER][0]);
7321       exp->enabled_mask |= 0x4;
7322    }
7323    if (ctx->vs_output.mask[VARYING_SLOT_VIEWPORT]) {
7324       if (ctx->options->chip_class < GFX9) {
7325          exp->operands[3] = Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]);
7326          exp->enabled_mask |= 0x8;
7327       } else {
7328          Builder bld(ctx->program, ctx->block);
7329
7330          Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(16u),
7331                              Operand(ctx->vs_output.outputs[VARYING_SLOT_VIEWPORT][0]));
7332          if (exp->operands[2].isTemp())
7333             out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]);
7334
7335          exp->operands[2] = Operand(out);
7336          exp->enabled_mask |= 0x4;
7337       }
7338    }
7339    exp->valid_mask = false;
7340    exp->done = false;
7341    exp->compressed = false;
7342    exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
7343    ctx->block->instructions.emplace_back(std::move(exp));
7344 }
7345
7346 static void create_vs_exports(isel_context *ctx)
7347 {
7348    radv_vs_output_info *outinfo = &ctx->program->info->vs.outinfo;
7349
7350    if (outinfo->export_prim_id) {
7351       ctx->vs_output.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1;
7352       ctx->vs_output.outputs[VARYING_SLOT_PRIMITIVE_ID][0] = ctx->vs_prim_id;
7353    }
7354
7355    if (ctx->options->key.has_multiview_view_index) {
7356       ctx->vs_output.mask[VARYING_SLOT_LAYER] |= 0x1;
7357       ctx->vs_output.outputs[VARYING_SLOT_LAYER][0] = as_vgpr(ctx, ctx->view_index);
7358    }
7359
7360    /* the order these position exports are created is important */
7361    int next_pos = 0;
7362    export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos);
7363    if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index) {
7364       export_vs_psiz_layer_viewport(ctx, &next_pos);
7365    }
7366    if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7367       export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos);
7368    if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7369       export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos);
7370
7371    if (ctx->options->key.vs_common_out.export_clip_dists) {
7372       if (ctx->num_clip_distances + ctx->num_cull_distances > 0)
7373          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos);
7374       if (ctx->num_clip_distances + ctx->num_cull_distances > 4)
7375          export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos);
7376    }
7377
7378    for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
7379       if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER &&
7380           i != VARYING_SLOT_PRIMITIVE_ID)
7381          continue;
7382
7383       export_vs_varying(ctx, i, false, NULL);
7384    }
7385 }
7386
7387 static void emit_stream_output(isel_context *ctx,
7388                                Temp const *so_buffers,
7389                                Temp const *so_write_offset,
7390                                const struct radv_stream_output *output)
7391 {
7392    unsigned num_comps = util_bitcount(output->component_mask);
7393    unsigned loc = output->location;
7394    unsigned buf = output->buffer;
7395    unsigned offset = output->offset;
7396
7397    assert(num_comps && num_comps <= 4);
7398    if (!num_comps || num_comps > 4)
7399       return;
7400
7401    unsigned start = ffs(output->component_mask) - 1;
7402
7403    Temp out[4];
7404    bool all_undef = true;
7405    assert(ctx->stage == vertex_vs);
7406    for (unsigned i = 0; i < num_comps; i++) {
7407       out[i] = ctx->vs_output.outputs[loc][start + i];
7408       all_undef = all_undef && !out[i].id();
7409    }
7410    if (all_undef)
7411       return;
7412
7413    Temp write_data = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_comps)};
7414    aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_comps, 1)};
7415    for (unsigned i = 0; i < num_comps; ++i)
7416       vec->operands[i] = (ctx->vs_output.mask[loc] & 1 << i) ? Operand(out[i]) : Operand(0u);
7417    vec->definitions[0] = Definition(write_data);
7418    ctx->block->instructions.emplace_back(std::move(vec));
7419
7420    aco_opcode opcode;
7421    switch (num_comps) {
7422    case 1:
7423       opcode = aco_opcode::buffer_store_dword;
7424       break;
7425    case 2:
7426       opcode = aco_opcode::buffer_store_dwordx2;
7427       break;
7428    case 3:
7429       opcode = aco_opcode::buffer_store_dwordx3;
7430       break;
7431    case 4:
7432       opcode = aco_opcode::buffer_store_dwordx4;
7433       break;
7434    }
7435
7436    aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)};
7437    store->operands[0] = Operand(so_write_offset[buf]);
7438    store->operands[1] = Operand(so_buffers[buf]);
7439    store->operands[2] = Operand((uint32_t) 0);
7440    store->operands[3] = Operand(write_data);
7441    if (offset > 4095) {
7442       /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */
7443       Builder bld(ctx->program, ctx->block);
7444       store->operands[0] = bld.vadd32(bld.def(v1), Operand(offset), Operand(so_write_offset[buf]));
7445    } else {
7446       store->offset = offset;
7447    }
7448    store->offen = true;
7449    store->glc = true;
7450    store->slc = true;
7451    store->can_reorder = true;
7452    ctx->block->instructions.emplace_back(std::move(store));
7453 }
7454
/*
 * Emit the transform feedback stores for all outputs of one stream.
 *
 * Loads the buffer descriptors of all buffers with a non-zero stride, then
 * wraps the stores in a divergent if so that only lanes whose index is below
 * the vertex count extracted from ctx->streamout_config execute them. Inside
 * the if, a per-buffer write offset is computed and emit_stream_output() is
 * called for every output that belongs to `stream`.
 *
 * ctx:    instruction selection context; ctx->block is advanced to the endif
 *         block of the emitted if
 * stream: stream index; outputs of other streams are skipped
 */
7455 static void emit_streamout(isel_context *ctx, unsigned stream)
7456 {
7457    Builder bld(ctx->program, ctx->block);
7458
        /* entries of buffers with stride 0 are left default-constructed */
7459    Temp so_buffers[4];
7460    Temp buf_ptr = convert_pointer_to_64_bit(ctx, ctx->streamout_buffers);
7461    for (unsigned i = 0; i < 4; i++) {
7462       unsigned stride = ctx->program->info->so.strides[i];
7463       if (!stride)
7464          continue;
7465
           /* descriptors are loaded as 4 dwords from byte offset i * 16 */
7466       so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, Operand(i * 16u));
7467    }
7468
        /* NOTE(review): 0x70010 presumably encodes a 7-bit field at bit offset 16
         * for s_bfe_u32 - confirm against the ISA documentation */
7469    Temp so_vtx_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
7470                                 ctx->streamout_config, Operand(0x70010u));
7471
        /* NOTE(review): mbcnt_lo/mbcnt_hi with an all-ones mask presumably yields
         * the lane index within the wave - confirm */
7472    Temp tid = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
7473                        bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
7474
        /* per-lane condition: so_vtx_count > tid */
7475    Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(s2), so_vtx_count, tid);
7476
7477    if_context ic;
7478    begin_divergent_if_then(ctx, &ic, can_emit);
7479
        /* begin_divergent_if_then() switched ctx->block to the then block */
7480    bld.reset(ctx->block);
7481
7482    Temp so_write_index = bld.vadd32(bld.def(v1), ctx->streamout_write_idx, tid);
7483
7484    Temp so_write_offset[4];
7485
7486    for (unsigned i = 0; i < 4; i++) {
7487       unsigned stride = ctx->program->info->so.strides[i];
7488       if (!stride)
7489          continue;
7490
7491       if (stride == 1) {
              /* (streamout_write_idx + streamout_offset[i] + tid) << 2 */
7492          Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
7493                                 ctx->streamout_write_idx, ctx->streamout_offset[i]);
7494          Temp new_offset = bld.vadd32(bld.def(v1), offset, tid);
7495
7496          so_write_offset[i] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), new_offset);
7497       } else {
              /* so_write_index * (stride * 4) + streamout_offset[i] * 4
               * NOTE(review): strides/offsets appear to be in dwords - confirm */
7498          Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u);
7499          Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand(4u), ctx->streamout_offset[i]);
7500          so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2);
7501       }
7502    }
7503
7504    for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) {
7505       struct radv_stream_output *output =
7506          &ctx->program->info->so.outputs[i];
7507       if (stream != output->stream)
7508          continue;
7509
7510       emit_stream_output(ctx, so_buffers, so_write_offset, output);
7511    }
7512
        /* close the if with an empty else side */
7513    begin_divergent_if_else(ctx, &ic);
7514    end_divergent_if(ctx, &ic);
7515 }
7516
7517 } /* end namespace */
7518
7519 void handle_bc_optimize(isel_context *ctx)
7520 {
7521    /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */
7522    Builder bld(ctx->program, ctx->block);
7523    uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena;
7524    bool uses_center = G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena);
7525    bool uses_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena);
7526    if (uses_center && uses_centroid) {
7527       Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(s2)), ctx->prim_mask, Operand(0u));
7528
7529       if (G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena)) {
7530          for (unsigned i = 0; i < 2; i++) {
7531             Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7532                                       ctx->fs_inputs[fs_input::persp_centroid_p1 + i],
7533                                       ctx->fs_inputs[fs_input::persp_center_p1 + i],
7534                                       sel);
7535             ctx->fs_inputs[fs_input::persp_centroid_p1 + i] = new_coord;
7536          }
7537       }
7538
7539       if (G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena)) {
7540          for (unsigned i = 0; i < 2; i++) {
7541             Temp new_coord = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
7542                                       ctx->fs_inputs[fs_input::linear_centroid_p1 + i],
7543                                       ctx->fs_inputs[fs_input::linear_center_p1 + i],
7544                                       sel);
7545             ctx->fs_inputs[fs_input::linear_centroid_p1 + i] = new_coord;
7546          }
7547       }
7548    }
7549 }
7550
7551 void select_program(Program *program,
7552                     unsigned shader_count,
7553                     struct nir_shader *const *shaders,
7554                     ac_shader_config* config,
7555                     struct radv_shader_info *info,
7556                     struct radv_nir_compiler_options *options)
7557 {
7558    isel_context ctx = setup_isel_context(program, shader_count, shaders, config, info, options);
7559
7560    for (unsigned i = 0; i < shader_count; i++) {
7561       nir_shader *nir = shaders[i];
7562       init_context(&ctx, nir);
7563
7564       if (!i) {
7565          add_startpgm(&ctx); /* needs to be after init_context() for FS */
7566          append_logical_start(ctx.block);
7567       }
7568
7569       if_context ic;
7570       if (shader_count >= 2) {
7571          Builder bld(ctx.program, ctx.block);
7572          Temp count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), ctx.merged_wave_info, Operand((8u << 16) | (i * 8u)));
7573          Temp thread_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1),
7574                                    bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)));
7575          Temp cond = bld.vopc(aco_opcode::v_cmp_gt_u32, bld.hint_vcc(bld.def(s2)), count, thread_id);
7576
7577          begin_divergent_if_then(&ctx, &ic, cond);
7578       }
7579
7580       if (i) {
7581          Builder bld(ctx.program, ctx.block);
7582          bld.barrier(aco_opcode::p_memory_barrier_shared); //TODO: different barriers are needed for different stages
7583          bld.sopp(aco_opcode::s_barrier);
7584       }
7585
7586       if (ctx.stage == fragment_fs)
7587          handle_bc_optimize(&ctx);
7588
7589       nir_function_impl *func = nir_shader_get_entrypoint(nir);
7590       visit_cf_list(&ctx, &func->body);
7591
7592       if (ctx.program->info->so.num_outputs/*&& !ctx->is_gs_copy_shader */)
7593          emit_streamout(&ctx, 0);
7594
7595       if (ctx.stage == vertex_vs)
7596          create_vs_exports(&ctx);
7597
7598       if (shader_count >= 2) {
7599          begin_divergent_if_else(&ctx, &ic);
7600          end_divergent_if(&ctx, &ic);
7601       }
7602
7603       ralloc_free(ctx.divergent_vals);
7604    }
7605
7606    append_logical_end(ctx.block);
7607    ctx.block->kind |= block_kind_uniform;
7608    Builder bld(ctx.program, ctx.block);
7609    if (ctx.program->wb_smem_l1_on_end)
7610       bld.smem(aco_opcode::s_dcache_wb, false);
7611    bld.sopp(aco_opcode::s_endpgm);
7612
7613    /* cleanup CFG */
7614    for (Block& BB : program->blocks) {
7615       for (unsigned idx : BB.linear_preds)
7616          program->blocks[idx].linear_succs.emplace_back(BB.index);
7617       for (unsigned idx : BB.logical_preds)
7618          program->blocks[idx].logical_succs.emplace_back(BB.index);
7619    }
7620 }
7621 }