X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_instruction_selection.cpp;h=80ea1e133d661420025877f3f024807687d4555e;hb=626081fe4bf653a18cff9e25e6da2636bc58774a;hp=beaebdb125700bcd2b71c360e49f94c672bad93b;hpb=3ee3ad561a29d5429309571db489f95e4ccaec5b;p=mesa.git diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index beaebdb1257..80ea1e133d6 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -136,8 +136,11 @@ Temp emit_mbcnt(isel_context *ctx, Definition dst, if (ctx->program->wave_size == 32) { return thread_id_lo; + } else if (ctx->program->chip_class <= GFX7) { + Temp thread_id_hi = bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo); + return thread_id_hi; } else { - Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo); + Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, dst, mask_hi, thread_id_lo); return thread_id_hi; } } @@ -169,33 +172,69 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data if (index.regClass() == s1) return bld.readlane(bld.def(s1), data, index); - Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index); - - /* Currently not implemented on GFX6-7 */ - assert(ctx->options->chip_class >= GFX8); - - if (ctx->options->chip_class <= GFX9 || ctx->program->wave_size == 32) { + if (ctx->options->chip_class <= GFX7) { + /* GFX6-7: there is no bpermute instruction */ + Operand index_op(index); + Operand input_data(data); + index_op.setLateKill(true); + input_data.setLateKill(true); + + return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data); + } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) { + /* GFX10 wave64 mode: emulate full-wave bpermute */ + if (!ctx->has_gfx10_wave64_bpermute) { + ctx->has_gfx10_wave64_bpermute = true; + ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */ + ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */ + } + + Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index); + Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo); + Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp()); + Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1); + Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index); + Operand input_data(data); + + index_x4.setLateKill(true); + input_data.setLateKill(true); + same_half.setLateKill(true); + + return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half); + } else { + /* GFX8-9 or GFX10 wave32: bpermute works normally */ + Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index); return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data); } +} - /* GFX10, wave64 mode: - * The bpermute instruction is limited to half-wave operation, which means that it can't - * properly support subgroup shuffle like older generations (or wave32 mode), so we - * emulate it here. 
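
/* Illustrative sketch only (standalone model, made-up names; needs C++20 for
 * std::popcount): what the v_mbcnt_lo_u32_b32 / v_mbcnt_hi_u32_b32 pair built
 * by emit_mbcnt() computes for a lane -- the number of set mask bits strictly
 * below that lane.  ds_bpermute_b32 then consumes such a lane index scaled by
 * 4, since it addresses lanes in bytes (the v_lshlrev_b32 by 2 above). */
#include <bit>
#include <cstdint>

static uint32_t model_mbcnt(uint64_t mask, unsigned lane /* 0..63 */)
{
    uint64_t below = mask & ((uint64_t(1) << lane) - 1u); /* bits under this lane */
    return std::popcount(below);  /* lo handles bits 0..31, hi adds bits 32..63 */
}
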
- */ - if (!ctx->has_gfx10_wave64_bpermute) { - ctx->has_gfx10_wave64_bpermute = true; - ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */ - ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */ - } +static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask) +{ + if (ctx->options->chip_class >= GFX8) { + unsigned and_mask = mask & 0x1f; + unsigned or_mask = (mask >> 5) & 0x1f; + unsigned xor_mask = (mask >> 10) & 0x1f; + + uint16_t dpp_ctrl = 0xffff; + + // TODO: we could use DPP8 for some swizzles + if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) { + unsigned res[4] = {0, 1, 2, 3}; + for (unsigned i = 0; i < 4; i++) + res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3; + dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]); + } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) { + dpp_ctrl = dpp_row_rr(8); + } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) { + dpp_ctrl = dpp_row_mirror; + } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) { + dpp_ctrl = dpp_row_half_mirror; + } - Temp lane_id = emit_mbcnt(ctx, bld.def(v1)); - Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id); - Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index); - Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), lane_is_hi, index_is_hi); + if (dpp_ctrl != 0xffff) + return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl); + } - return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), - bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute); + return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false); } Temp as_vgpr(isel_context *ctx, Temp val) @@ -396,7 +435,7 @@ void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst) bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec); hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u)); if (select != Temp()) - hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), select); + hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select)); lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift); Temp mid = bld.tmp(s1); lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo); @@ -407,38 +446,67 @@ void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst) } } -/* this function trims subdword vectors: - * if dst is vgpr - split the src and create a shrunk version according to the mask. - * if dst is sgpr - split the src, but move the original to sgpr. 
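
/* Illustrative sketch only (standalone, made-up names): how the 15-bit
 * ds_swizzle mask handled by emit_masked_swizzle() above splits into
 * AND/OR/XOR fields, and how the quad-permute pattern is derived when the
 * AND field keeps all five lane bits (and_mask == 0x1f) and the OR/XOR
 * fields stay inside a quad (< 4). */
struct swizzle_fields { unsigned and_mask, or_mask, xor_mask; };

static swizzle_fields decode_swizzle(unsigned mask)
{
    return { mask & 0x1fu, (mask >> 5) & 0x1fu, (mask >> 10) & 0x1fu };
}

/* With and_mask == 0x1f the swizzle reduces to (lane | or_mask) ^ xor_mask,
 * which is what the patch folds into dpp_quad_perm(). */
static unsigned quad_perm_slot(unsigned lane_in_quad, swizzle_fields f)
{
    return ((lane_in_quad | f.or_mask) ^ f.xor_mask) & 0x3u;
}
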
*/ -void trim_subdword_vector(isel_context *ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask) +void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size) { - assert(vec_src.type() == RegType::vgpr); - emit_split_vector(ctx, vec_src, num_components); - Builder bld(ctx->program, ctx->block); - std::array elems; - unsigned component_size = vec_src.bytes() / num_components; - RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword(); + if (offset.isTemp()) { + Temp tmp[4] = {vec, vec, vec, vec}; - unsigned k = 0; - for (unsigned i = 0; i < num_components; i++) { - if (mask & (1 << i)) - elems[k++] = emit_extract_vector(ctx, vec_src, i, rc); + if (vec.size() == 4) { + tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec); + } else if (vec.size() == 3) { + tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec); + } else if (vec.size() == 2) { + tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1]; + bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec); + } + for (unsigned i = 0; i < dst.size(); i++) + tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset); + + vec = tmp[0]; + if (dst.size() == 2) + vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]); + + offset = Operand(0u); + } + + unsigned num_components = vec.bytes() / component_size; + if (vec.regClass() == dst.regClass()) { + assert(offset.constantValue() == 0); + bld.copy(Definition(dst), vec); + emit_split_vector(ctx, dst, num_components); + return; } + emit_split_vector(ctx, vec, num_components); + std::array elems; + RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword(); + + assert(offset.constantValue() % component_size == 0); + unsigned skip = offset.constantValue() / component_size; + for (unsigned i = skip; i < num_components; i++) + elems[i - skip] = emit_extract_vector(ctx, vec, i, rc); + + /* if dst is vgpr - split the src and create a shrunk version according to the mask. */ if (dst.type() == RegType::vgpr) { - assert(dst.bytes() == k * component_size); - aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, k, 1)}; - for (unsigned i = 0; i < k; i++) - vec->operands[i] = Operand(elems[i]); - vec->definitions[0] = Definition(dst); - bld.insert(std::move(vec)); + num_components = dst.bytes() / component_size; + aco_ptr create_vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; + for (unsigned i = 0; i < num_components; i++) + create_vec->operands[i] = Operand(elems[i]); + create_vec->definitions[0] = Definition(dst); + bld.insert(std::move(create_vec)); + + /* if dst is sgpr - split the src, but move the original to sgpr. */ + } else if (skip) { + vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec); + byte_align_scalar(ctx, vec, offset, dst); } else { - // TODO: alignbyte if mask doesn't start with 1? 
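
/* Illustrative sketch only (standalone model; only the behaviour visible in
 * byte_align_vector() above is assumed): v_alignbyte_b32 roughly computes
 * ({hi,lo} >> (8 * (offset & 3))) & 0xffffffff, so chaining it over adjacent
 * dwords shifts a whole vector right by 'offset' bytes. */
#include <cstdint>

static uint32_t model_alignbyte(uint32_t hi, uint32_t lo, unsigned byte_offset)
{
    uint64_t packed = (uint64_t(hi) << 32) | lo;
    return uint32_t(packed >> (8u * (byte_offset & 3u)));
}

/* Aligning a two-dword vector by 'offset' bytes, mirroring the loop above
 * (tmp[2] = tmp[1] duplicates the last dword):
 *   out[0] = model_alignbyte(in[1], in[0], offset);
 *   out[1] = model_alignbyte(in[1], in[1], offset);
 */
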
- assert(mask & 1); - assert(dst.size() == vec_src.size()); - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src); + assert(dst.size() == vec.size()); + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec); } + ctx->allocated_vec.emplace(dst.id(), elems); } @@ -547,6 +615,8 @@ void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0])); sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1])); sop2->definitions[0] = Definition(dst); + if (instr->no_unsigned_wrap) + sop2->definitions[0].setNUW(true); if (writes_scc) sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1); ctx->block->instructions.emplace_back(std::move(sop2)); @@ -556,6 +626,8 @@ void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o bool commutative, bool swap_srcs=false, bool flush_denorms = false) { Builder bld(ctx->program, ctx->block); + bld.is_precise = instr->exact; + Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]); Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]); if (src1.type() == RegType::sgpr) { @@ -577,6 +649,31 @@ void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o } } +void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr, + aco_opcode op, Temp dst) +{ + Builder bld(ctx->program, ctx->block); + bld.is_precise = instr->exact; + + Temp src0 = get_alu_src(ctx, instr->src[0]); + Temp src1 = get_alu_src(ctx, instr->src[1]); + + if (src1.type() == RegType::sgpr) { + assert(src0.type() == RegType::vgpr); + std::swap(src0, src1); + } + + Temp src00 = bld.tmp(src0.type(), 1); + Temp src01 = bld.tmp(src0.type(), 1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); + Temp src10 = bld.tmp(v1); + Temp src11 = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); + Temp lo = bld.vop2(op, bld.def(v1), src00, src10); + Temp hi = bld.vop2(op, bld.def(v1), src01, src11); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); +} + void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst, bool flush_denorms = false) { @@ -594,6 +691,7 @@ void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode src2 = as_vgpr(ctx, src2); Builder bld(ctx->program, ctx->block); + bld.is_precise = instr->exact; if (flush_denorms && ctx->program->chip_class < GFX9) { assert(dst.size() == 1); Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2); @@ -606,7 +704,12 @@ void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) { Builder bld(ctx->program, ctx->block); - bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0])); + bld.is_precise = instr->exact; + if (dst.type() == RegType::sgpr) + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), + bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0]))); + else + bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0])); } void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst) @@ -711,9 +814,8 @@ void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst, { aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? 
s32_op : aco_opcode::num_opcodes; aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op; - bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index]; bool use_valu = s_op == aco_opcode::num_opcodes || - divergent_vals || + nir_dest_is_divergent(instr->dest.dest) || ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr || ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr; aco_opcode op = use_valu ? v_op : s_op; @@ -750,18 +852,12 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) if (dst.type() == RegType::vgpr) { aco_ptr bcsel; - if (dst.regClass() == v2b) { - then = as_vgpr(ctx, then); - els = as_vgpr(ctx, els); - - Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), els, then, cond); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); - } else if (dst.regClass() == v1) { + if (dst.size() == 1) { then = as_vgpr(ctx, then); els = as_vgpr(ctx, els); bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond); - } else if (dst.regClass() == v2) { + } else if (dst.size() == 2) { Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then); Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1); @@ -785,7 +881,7 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst) assert(els.regClass() == bld.lm); } - if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */ + if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */ if (dst.regClass() == s1 || dst.regClass() == s2) { assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass()); assert(dst.size() == then.size()); @@ -884,7 +980,8 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val); /* Extract the exponent and compute the unbiased value. */ - Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f64, bld.def(v1), val); + Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u)); + exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u)); /* Extract the fractional part. */ Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu)); @@ -900,7 +997,7 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp); /* Get the sign bit. */ - Temp sign = bld.vop2(aco_opcode::v_ashr_i32, bld.def(v1), Operand(31u), val_hi); + Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi); /* Decide the operation to apply depending on the unbiased exponent. */ Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u)); @@ -918,7 +1015,8 @@ Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) if (ctx->options->chip_class >= GFX7) return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val); - /* GFX6 doesn't support V_FLOOR_F64, lower it. */ + /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually + * lowered at NIR level for precision reasons). 
*/ Temp src0 = as_vgpr(ctx, val); Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */ @@ -944,7 +1042,7 @@ Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val) return add->definitions[0].getTemp(); } -Temp convert_int(Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) { +Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) { if (!dst.id()) { if (dst_bits % 32 == 0 || src.type() == RegType::sgpr) dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u)); @@ -967,7 +1065,7 @@ Temp convert_int(Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, b bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src); else bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src); - } else { + } else if (ctx->options->chip_class >= GFX8) { assert(src_bits != 8 || src.regClass() == v1b); assert(src_bits != 16 || src.regClass() == v2b); aco_ptr sdwa{create_instruction(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)}; @@ -979,6 +1077,10 @@ Temp convert_int(Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, b sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword; sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword; bld.insert(std::move(sdwa)); + } else { + assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7); + aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32; + bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u)); } if (dst_bits == 64) { @@ -1005,6 +1107,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) abort(); } Builder bld(ctx->program, ctx->block); + bld.is_precise = instr->exact; Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa); switch(instr->op) { case nir_op_vec2: @@ -1062,13 +1165,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src); else unreachable("wrong src register class for nir_op_imov"); - } else if (dst.regClass() == v1) { - bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src); - } else if (dst.regClass() == v2) { - bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src); } else { - nir_print_instr(&instr->instr, stderr); - unreachable("Should have been lowered to scalar."); + if (dst.regClass() == v1) + bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src); + else if (dst.regClass() == v1b || + dst.regClass() == v2b || + dst.regClass() == v2) + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src); + else + unreachable("wrong src register class for nir_op_imov"); } break; } @@ -1082,6 +1187,12 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm)); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst); + } else if (dst.regClass() == v2) { + Temp lo = bld.tmp(v1), hi = bld.tmp(v1); + bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); + lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo); + hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi); + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); } else if (dst.type() == RegType::sgpr) { aco_opcode opcode = dst.size() == 1 ? 
aco_opcode::s_not_b32 : aco_opcode::s_not_b64; bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src); @@ -1217,6 +1328,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) emit_boolean_logic(ctx, instr, Builder::s_or, dst); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true); + } else if (dst.regClass() == v2) { + emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst); } else if (dst.regClass() == s1) { emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true); } else if (dst.regClass() == s2) { @@ -1233,6 +1346,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) emit_boolean_logic(ctx, instr, Builder::s_and, dst); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true); + } else if (dst.regClass() == v2) { + emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst); } else if (dst.regClass() == s1) { emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true); } else if (dst.regClass() == s2) { @@ -1249,6 +1364,8 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) emit_boolean_logic(ctx, instr, Builder::s_xor, dst); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true); + } else if (dst.regClass() == v2) { + emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst); } else if (dst.regClass() == s1) { emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true); } else if (dst.regClass() == s2) { @@ -1601,9 +1718,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, tmp, true); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true); } else if (dst.regClass() == v2) { @@ -1619,9 +1734,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, tmp, true); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true); } else if (dst.regClass() == v2) { @@ -1637,12 +1750,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) - emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, tmp, false); + emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false); else - emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, tmp, true); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true); } else if (dst.regClass() == v1) { if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) 
emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false); @@ -1665,9 +1776,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); if (dst.regClass() == v2b) { // TODO: check fp_mode.must_flush_denorms16_64 - Temp tmp = bld.tmp(v1); - emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, tmp, true); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32); } else if (dst.regClass() == v2) { @@ -1689,9 +1798,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); if (dst.regClass() == v2b) { // TODO: check fp_mode.must_flush_denorms16_64 - Temp tmp = bld.tmp(v1); - emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, tmp, true); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true); } else if (dst.regClass() == v1) { emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32); } else if (dst.regClass() == v2) { @@ -1710,9 +1817,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fmax3: { if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, tmp, false); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, dst, false); } else if (dst.regClass() == v1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); } else { @@ -1724,9 +1829,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fmin3: { if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, tmp, false); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, dst, false); } else if (dst.regClass() == v1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); } else { @@ -1738,9 +1841,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fmed3: { if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, tmp, false); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, dst, false); } else if (dst.regClass() == v1) { emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32); } else { @@ -1839,11 +1940,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_frsq: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_rsq_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst); } else if (dst.regClass() == v1) { emit_rsq(ctx, bld, Definition(dst), src); } else if (dst.regClass() == v2) { + /* Lowered at NIR level for precision reasons. 
*/ emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1855,8 +1956,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fneg: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x8000u), as_vgpr(ctx, src)); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + if (ctx->block->fp_mode.must_flush_denorms16_64) + src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src)); + bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src)); } else if (dst.regClass() == v1) { if (ctx->block->fp_mode.must_flush_denorms32) src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src)); @@ -1878,8 +1980,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fabs: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFu), as_vgpr(ctx, src)); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + if (ctx->block->fp_mode.must_flush_denorms16_64) + src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src)); + bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src)); } else if (dst.regClass() == v1) { if (ctx->block->fp_mode.must_flush_denorms32) src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src)); @@ -1901,8 +2004,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fsat: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop3(aco_opcode::v_med3_f16, bld.def(v1), Operand(0u), Operand(0x3f800000u), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src); } else if (dst.regClass() == v1) { bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src); /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */ @@ -1921,8 +2023,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_flog2: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_log_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst); } else if (dst.regClass() == v1) { emit_log2(ctx, bld, Definition(dst), src); } else { @@ -1935,11 +2036,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_frcp: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_rcp_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst); } else if (dst.regClass() == v1) { emit_rcp(ctx, bld, Definition(dst), src); } else if (dst.regClass() == v2) { + /* Lowered at NIR level for precision reasons. 
*/ emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1950,9 +2051,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_fexp2: { if (dst.regClass() == v2b) { - Temp src = get_alu_src(ctx, instr->src[0]); - Temp tmp = bld.vop1(aco_opcode::v_exp_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst); } else { @@ -1965,11 +2064,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fsqrt: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_sqrt_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst); } else if (dst.regClass() == v1) { emit_sqrt(ctx, bld, Definition(dst), src); } else if (dst.regClass() == v2) { + /* Lowered at NIR level for precision reasons. */ emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -1980,9 +2079,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_ffract: { if (dst.regClass() == v2b) { - Temp src = get_alu_src(ctx, instr->src[0]); - Temp tmp = bld.vop1(aco_opcode::v_fract_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst); } else if (dst.regClass() == v2) { @@ -1997,8 +2094,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_ffloor: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_floor_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst); } else if (dst.regClass() == v2) { @@ -2013,8 +2109,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fceil: { Temp src0 = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_ceil_f16, bld.def(v1), src0); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst); } else if (dst.regClass() == v2) { @@ -2044,8 +2139,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_ftrunc: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_trunc_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst); } else if (dst.regClass() == v2) { @@ -2060,8 +2154,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fround_even: { Temp src0 = get_alu_src(ctx, 
instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_rndne_f16, bld.def(v1), src0); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst); } else if (dst.regClass() == v1) { emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst); } else if (dst.regClass() == v2) { @@ -2102,13 +2195,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_fcos: { Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); aco_ptr norm; - Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u)); if (dst.regClass() == v2b) { + Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u)); Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src); aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16; - tmp = bld.vop1(opcode, bld.def(v1), tmp); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop1(opcode, Definition(dst), tmp); } else if (dst.regClass() == v1) { + Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u)); Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src); /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */ @@ -2128,9 +2221,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src0 = get_alu_src(ctx, instr->src[0]); Temp src1 = get_alu_src(ctx, instr->src[1]); if (dst.regClass() == v2b) { - Temp tmp = bld.tmp(v1); - emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, tmp, false); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false); } else if (dst.regClass() == v1) { bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1); } else if (dst.regClass() == v2) { @@ -2145,8 +2236,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_frexp_sig: { Temp src = get_alu_src(ctx, instr->src[0]); if (dst.regClass() == v2b) { - Temp tmp = bld.vop1(aco_opcode::v_frexp_mant_f16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src); } else if (dst.regClass() == v1) { bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src); } else if (dst.regClass() == v2) { @@ -2163,7 +2253,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (instr->src[0].src.ssa->bit_size == 16) { Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src); tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand(0u)); - convert_int(bld, tmp, 8, 32, true, dst); + convert_int(ctx, bld, tmp, 8, 32, true, dst); } else if (instr->src[0].src.ssa->bit_size == 32) { bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src); } else if (instr->src[0].src.ssa->bit_size == 64) { @@ -2183,8 +2273,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond); cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); - Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), minus_one, src, cond); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, 
cond); } else if (dst.regClass() == v1) { Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src); src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond); @@ -2212,16 +2301,20 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 64) src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src); - src = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src); + if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne) + /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to + * keep value numbering and the scheduler simpler. + */ + bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src); + else + bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); break; } case nir_op_f2f16_rtz: { Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 64) src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src); - src = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), src, Operand(0u)); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src); + bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u)); break; } case nir_op_f2f32: { @@ -2247,16 +2340,17 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) assert(dst.regClass() == v2b); Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 8) - src = convert_int(bld, src, 8, 16, true); - Temp tmp = bld.vop1(aco_opcode::v_cvt_f16_i16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + src = convert_int(ctx, bld, src, 8, 16, true); + else if (instr->src[0].src.ssa->bit_size == 64) + src = convert_int(ctx, bld, src, 64, 32, false); + bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src); break; } case nir_op_i2f32: { assert(dst.size() == 1); Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size <= 16) - src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true); + src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true); bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src); break; } @@ -2264,7 +2358,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (instr->src[0].src.ssa->bit_size <= 32) { Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size <= 16) - src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true); + src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true); bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src); } else if (instr->src[0].src.ssa->bit_size == 64) { Temp src = get_alu_src(ctx, instr->src[0]); @@ -2287,20 +2381,20 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) assert(dst.regClass() == v2b); Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 8) - src = convert_int(bld, src, 8, 16, false); - Temp tmp = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v1), src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + src = convert_int(ctx, bld, src, 8, 16, false); + else if (instr->src[0].src.ssa->bit_size == 64) + src = convert_int(ctx, bld, src, 64, 32, false); + bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src); break; } case nir_op_u2f32: { assert(dst.size() == 1); 
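
/* Illustrative sketch only (standalone model, made-up name): what the GFX6/7
 * fallback added to convert_int() computes with v_bfe_u32 / v_bfe_i32 when
 * widening an 8- or 16-bit value to 32 bits before the float conversion,
 * assuming the usual BFE semantics (offset 0, width = source bits). */
#include <cstdint>

static uint32_t model_bfe_widen(uint32_t src, unsigned src_bits, bool is_signed)
{
    uint32_t field = src & ((1u << src_bits) - 1u);     /* v_bfe_u32 src, 0, src_bits */
    if (is_signed && (field & (1u << (src_bits - 1))))  /* v_bfe_i32 additionally     */
        field |= ~((1u << src_bits) - 1u);              /* sign-extends the field     */
    return field;
}
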
Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 8) { - //TODO: we should use v_cvt_f32_ubyte1/v_cvt_f32_ubyte2/etc depending on the register assignment bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src); } else { if (instr->src[0].src.ssa->bit_size == 16) - src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true); + src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true); bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src); } break; @@ -2309,7 +2403,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) if (instr->src[0].src.ssa->bit_size <= 32) { Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size <= 16) - src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, false); + src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false); bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src); } else if (instr->src[0].src.ssa->bit_size == 64) { Temp src = get_alu_src(ctx, instr->src[0]); @@ -2329,34 +2423,22 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_f2i8: case nir_op_f2i16: { - Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 16) - src = bld.vop1(aco_opcode::v_cvt_i16_f16, bld.def(v1), src); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst); else if (instr->src[0].src.ssa->bit_size == 32) - src = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src); - else - src = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src); - - if (dst.type() == RegType::vgpr) - bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u)); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst); else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst); break; } case nir_op_f2u8: case nir_op_f2u16: { - Temp src = get_alu_src(ctx, instr->src[0]); if (instr->src[0].src.ssa->bit_size == 16) - src = bld.vop1(aco_opcode::v_cvt_u16_f16, bld.def(v1), src); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst); else if (instr->src[0].src.ssa->bit_size == 32) - src = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); else - src = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src); - - if (dst.type() == RegType::vgpr) - bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u)); - else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src); + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); break; } case nir_op_f2i32: { @@ -2370,19 +2452,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp)); } } else if (instr->src[0].src.ssa->bit_size == 32) { - if (dst.type() == RegType::vgpr) - bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src); - else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), - bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src)); - + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst); } else if (instr->src[0].src.ssa->bit_size == 64) { - if (dst.type() == RegType::vgpr) - bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src); - else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), - bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src)); - + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst); } else { fprintf(stderr, 
"Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -2401,19 +2473,9 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp)); } } else if (instr->src[0].src.ssa->bit_size == 32) { - if (dst.type() == RegType::vgpr) - bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src); - else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), - bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src)); - + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); } else if (instr->src[0].src.ssa->bit_size == 64) { - if (dst.type() == RegType::vgpr) - bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src); - else - bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), - bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src)); - + emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); nir_print_instr(&instr->instr, stderr); @@ -2581,8 +2643,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3c00u), src); } else if (dst.regClass() == v2b) { Temp one = bld.copy(bld.def(v1), Operand(0x3c00u)); - Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src); - bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); + bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), one, src); } else { unreachable("Wrong destination register class for nir_op_b2f16."); } @@ -2622,7 +2683,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_i2i16: case nir_op_i2i32: case nir_op_i2i64: { - convert_int(bld, get_alu_src(ctx, instr->src[0]), + convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, true, dst); break; } @@ -2630,23 +2691,30 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) case nir_op_u2u16: case nir_op_u2u32: case nir_op_u2u64: { - convert_int(bld, get_alu_src(ctx, instr->src[0]), + convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst); break; } case nir_op_b2b32: - case nir_op_b2i32: { + case nir_op_b2i8: + case nir_op_b2i16: + case nir_op_b2i32: + case nir_op_b2i64: { Temp src = get_alu_src(ctx, instr->src[0]); assert(src.regClass() == bld.lm); - if (dst.regClass() == s1) { + Temp tmp = dst.bytes() == 8 ? 
bld.tmp(RegClass::get(dst.type(), 4)) : dst; + if (tmp.regClass() == s1) { // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ - bool_to_scalar_condition(ctx, src, dst); - } else if (dst.regClass() == v1) { - bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src); + bool_to_scalar_condition(ctx, src, tmp); + } else if (tmp.type() == RegType::vgpr) { + bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand(0u), Operand(1u), src); } else { unreachable("Invalid register class for b2i32"); } + + if (tmp != dst) + bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u)); break; } case nir_op_b2b1: @@ -2735,7 +2803,6 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_unpack_half_2x16_split_x: { if (dst.regClass() == v1) { - Builder bld(ctx->program, ctx->block); bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0])); } else { fprintf(stderr, "Unimplemented NIR instr bit size: "); @@ -2746,7 +2813,6 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr) } case nir_op_unpack_half_2x16_split_y: { if (dst.regClass() == v1) { - Builder bld(ctx->program, ctx->block); /* TODO: use SDWA here */ bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0])))); @@ -3058,35 +3124,6 @@ uint32_t widen_mask(uint32_t mask, unsigned multiplier) return new_mask; } -void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst) -{ - Builder bld(ctx->program, ctx->block); - if (offset.isTemp()) { - Temp tmp[3] = {vec, vec, vec}; - - if (vec.size() == 3) { - tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1); - bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec); - } else if (vec.size() == 2) { - tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1]; - bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec); - } - for (unsigned i = 0; i < dst.size(); i++) - tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset); - - vec = tmp[0]; - if (dst.size() == 2) - vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]); - - offset = Operand(0u); - } - - if (vec.bytes() == dst.bytes() && offset.constantValue() == 0) - bld.copy(Definition(dst), vec); - else - trim_subdword_vector(ctx, vec, dst, vec.bytes(), ((1 << dst.bytes()) - 1) << offset.constantValue()); -} - struct LoadEmitInfo { Operand offset; Temp dst; @@ -3131,7 +3168,9 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) int byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1; if (byte_align) { - if ((bytes_needed > 2 || !supports_8bit_16bit_loads) && byte_align_loads) { + if ((bytes_needed > 2 || + (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) || + !supports_8bit_16bit_loads) && byte_align_loads) { if (info->component_stride) { assert(supports_8bit_16bit_loads && "unimplemented"); bytes_needed = 2; @@ -3193,7 +3232,9 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) /* align offset down if needed */ Operand aligned_offset = offset; + unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul; if (need_to_align_offset) { + align = 4; Temp offset_tmp = offset.isTemp() ? 
offset.getTemp() : Temp(); if (offset.isConstant()) { aligned_offset = Operand(offset.constantValue() & 0xfffffffcu); @@ -3213,12 +3254,18 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() : bld.copy(bld.def(s1), aligned_offset); - unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul; Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align, reduced_const_offset, byte_align ? Temp() : info->dst); + /* the callback wrote directly to dst */ + if (val == info->dst) { + assert(num_vals == 0); + emit_split_vector(ctx, info->dst, info->num_components); + return; + } + /* shift result right if needed */ - if (byte_align) { + if (info->component_size < 4 && byte_align_loads) { Operand align((uint32_t)byte_align); if (byte_align == -1) { if (offset.isConstant()) @@ -3229,15 +3276,12 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) align = offset; } - if (align.isTemp() || align.constantValue()) { - assert(val.bytes() >= load_size && "unimplemented"); - Temp new_val = bld.tmp(RegClass::get(val.type(), load_size)); - if (val.type() == RegType::sgpr) - byte_align_scalar(ctx, val, align, new_val); - else - byte_align_vector(ctx, val, align, new_val); - val = new_val; - } + assert(val.bytes() >= load_size && "unimplemented"); + if (val.type() == RegType::sgpr) + byte_align_scalar(ctx, val, align, info->dst); + else + byte_align_vector(ctx, val, align, info->dst, component_size); + return; } /* add result to list and advance */ @@ -3253,13 +3297,6 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) vals[num_vals++] = val; } - /* the callback wrote directly to dst */ - if (vals[0] == info->dst) { - assert(num_vals == 1); - emit_split_vector(ctx, info->dst, info->num_components); - return; - } - /* create array of components */ unsigned components_split = 0; std::array allocated_vec; @@ -3278,7 +3315,7 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info) if (num_tmps > 1) { aco_ptr vec{create_instruction( aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)}; - for (unsigned i = 0; i < num_vals; i++) + for (unsigned i = 0; i < num_tmps; i++) vec->operands[i] = Operand(tmp[i]); tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size)); vec->definitions[0] = Definition(tmp[0]); @@ -3477,10 +3514,10 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info, unsigned bytes_size = 0; aco_opcode op; - if (bytes_needed == 1) { + if (bytes_needed == 1 || align_ % 2) { bytes_size = 1; op = aco_opcode::buffer_load_ubyte; - } else if (bytes_needed == 2) { + } else if (bytes_needed == 2 || align_ % 4) { bytes_size = 2; op = aco_opcode::buffer_load_ushort; } else if (bytes_needed <= 4) { @@ -3506,18 +3543,17 @@ Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info, mubuf->barrier = info->barrier; mubuf->can_reorder = info->can_reorder; mubuf->offset = const_offset; - RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4)); + mubuf->swizzled = info->swizzle_component_size != 0; + RegClass rc = RegClass::get(RegType::vgpr, bytes_size); Temp val = dst_hint.id() && rc == dst_hint.regClass() ? 
dst_hint : bld.tmp(rc); mubuf->definitions[0] = Definition(val); bld.insert(std::move(mubuf)); - if (bytes_size < 4) - val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, bytes_size)), val, Operand(0u)); - return val; } static auto emit_mubuf_load = emit_load; +static auto emit_scratch_load = emit_load; Temp get_gfx6_global_rsrc(Builder& bld, Temp addr) { @@ -3586,9 +3622,6 @@ Temp global_load_callback(Builder& bld, const LoadEmitInfo *info, bld.insert(std::move(flat)); } - if (bytes_size < 4) - val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, bytes_size)), val, Operand(0u)); - return val; } @@ -3645,13 +3678,15 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp /* use allocated_vec if possible */ auto it = ctx->allocated_vec.find(src.id()); if (it != ctx->allocated_vec.end()) { - unsigned total_size = 0; - for (unsigned i = 0; it->second[i].bytes() && (i < NIR_MAX_VEC_COMPONENTS); i++) - total_size += it->second[i].bytes(); - if (total_size != src.bytes()) + if (!it->second[0].id()) goto split; - unsigned elem_size = it->second[0].bytes(); + assert(src.bytes() % elem_size == 0); + + for (unsigned i = 0; i < src.bytes() / elem_size; i++) { + if (!it->second[i].id()) + goto split; + } for (unsigned i = 0; i < count; i++) { if (offsets[i] % elem_size || dst[i].bytes() % elem_size) @@ -3683,10 +3718,11 @@ void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp } } + split: + if (dst_type == RegType::sgpr) src = bld.as_uniform(src); - split: /* just split it */ aco_ptr split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)}; split->operands[0] = Operand(src); @@ -3886,10 +3922,10 @@ void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem /* dword or larger stores have to be dword-aligned */ unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4; - unsigned align_offset = instr ? nir_intrinsic_align_mul(instr) : 0; - bool dword_aligned = (align_offset + offset) % 4 == 0 && align_mul % 4 == 0; - if (bytes >= 4 && !dword_aligned) - bytes = MIN2(bytes, 2); + unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset; + bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0; + if (!dword_aligned) + bytes = MIN2(bytes, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1); advance_write_mask(&todo, offset, bytes); write_count_with_skips++; @@ -3963,7 +3999,8 @@ inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, un } void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata, - unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false) + unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false, + bool swizzled = false) { assert(vdata.id()); assert(vdata.size() != 3 || ctx->program->chip_class != GFX6); @@ -3976,8 +4013,9 @@ void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, T Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1); Operand soffset_op = soffset.id() ? 
Operand(soffset) : Operand(0u); Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset, - /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false, - /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc); + /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled, + /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true, + /* dlc*/ false, /* slc */ slc); static_cast(r.instr)->can_reorder = allow_reorder; } @@ -3987,7 +4025,7 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset bool allow_combining = true, bool reorder = true, bool slc = false) { Builder bld(ctx->program, ctx->block); - assert(elem_size_bytes == 4 || elem_size_bytes == 8); + assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); assert(write_mask); write_mask = widen_mask(write_mask, elem_size_bytes); @@ -3999,7 +4037,7 @@ void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset for (unsigned i = 0; i < write_count; i++) { unsigned const_offset = offsets[i] + base_const_offset; - emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc); + emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc, !allow_combining); } } @@ -4007,8 +4045,8 @@ void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components, unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true) { - assert(elem_size_bytes == 4 || elem_size_bytes == 8); - assert((num_components * elem_size_bytes / 4) == dst.size()); + assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); + assert((num_components * elem_size_bytes) == dst.bytes()); assert(!!stride != allow_combining); Builder bld(ctx->program, ctx->block); @@ -4153,11 +4191,9 @@ std::pair get_tcs_output_lds_offset(isel_context *ctx, nir_intri Builder bld(ctx->program, ctx->block); uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16; - uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written); - uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written); - uint32_t output_vertex_size = num_tcs_outputs * 16; + uint32_t output_vertex_size = ctx->tcs_num_outputs * 16; uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; - uint32_t output_patch_stride = pervertex_output_patch_size + num_tcs_patch_outputs * 16; + uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16; std::pair offs = instr ? get_intrinsic_io_basic_offset(ctx, instr, 4u) @@ -4205,11 +4241,7 @@ std::pair get_tcs_per_patch_output_vmem_offset(isel_context *ctx { Builder bld(ctx->program, ctx->block); - unsigned num_tcs_outputs = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL - ? 
util_last_bit64(ctx->args->shader_info->tcs.outputs_written) - : ctx->args->options->key.tes.tcs_num_outputs; - - unsigned output_vertex_size = num_tcs_outputs * 16; + unsigned output_vertex_size = ctx->tcs_num_outputs * 16; unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size; unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches; unsigned attr_stride = ctx->tcs_num_patches; @@ -4230,10 +4262,12 @@ std::pair get_tcs_per_patch_output_vmem_offset(isel_context *ctx bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect) { + assert(per_vertex || ctx->shader->info.stage == MESA_SHADER_TESS_CTRL); + if (mask == 0) return false; - unsigned off = nir_intrinsic_base(instr) * 4u; + unsigned drv_loc = nir_intrinsic_base(instr); nir_src *off_src = nir_get_io_offset_src(instr); if (!nir_src_is_const(*off_src)) { @@ -4242,15 +4276,10 @@ bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr } *indirect = false; - off += nir_src_as_uint(*off_src) * 16u; - - while (mask) { - unsigned slot = u_bit_scan64(&mask) + (per_vertex ? 0 : VARYING_SLOT_PATCH0); - if (off == shader_io_get_unique_index((gl_varying_slot) slot) * 16u) - return true; - } - - return false; + uint64_t slot = per_vertex + ? ctx->output_drv_loc_to_var_slot[ctx->shader->info.stage][drv_loc / 4] + : (ctx->output_tcs_patch_drv_loc_to_var_slot[drv_loc / 4] - VARYING_SLOT_PATCH0); + return (((uint64_t) 1) << slot) & mask; } bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr) @@ -4269,10 +4298,12 @@ bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr) if (instr->src[0].ssa->bit_size == 64) write_mask = widen_mask(write_mask, 2); + RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1; + for (unsigned i = 0; i < 8; ++i) { if (write_mask & (1 << i)) { ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u); - ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, v1); + ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc); } idx++; } @@ -4344,9 +4375,8 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr) /* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS. * GFX9+: LS is merged into HS, but still uses the same LDS layout. 
*/ - unsigned num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written); Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id); - lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, num_tcs_inputs * 16u); + lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u); } else { unreachable("Invalid LS or ES stage"); } @@ -4477,10 +4507,40 @@ void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp Temp coord2 = emit_extract_vector(ctx, src, 1, v1); Builder bld(ctx->program, ctx->block); - Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component); - if (ctx->program->has_16bank_lds) - interp_p1.instr->operands[0].setLateKill(true); - bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx, component); + + if (dst.regClass() == v2b) { + if (ctx->program->has_16bank_lds) { + assert(ctx->options->chip_class <= GFX8); + Builder::Result interp_p1 = + bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), + Operand(2u) /* P0 */, bld.m0(prim_mask), idx, component); + interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), + coord1, bld.m0(prim_mask), interp_p1, idx, component); + bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, + bld.m0(prim_mask), interp_p1, idx, component); + } else { + aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16; + + if (ctx->options->chip_class == GFX8) + interp_p2_op = aco_opcode::v_interp_p2_legacy_f16; + + Builder::Result interp_p1 = + bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), + coord1, bld.m0(prim_mask), idx, component); + bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), + interp_p1, idx, component); + } + } else { + Builder::Result interp_p1 = + bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, + bld.m0(prim_mask), idx, component); + + if (ctx->program->has_16bank_lds) + interp_p1.instr->operands[0].setLateKill(true); + + bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, + bld.m0(prim_mask), interp_p1, idx, component); + } } void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components) @@ -4646,6 +4706,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset; unsigned component = nir_intrinsic_component(instr); + unsigned bitsize = instr->dest.ssa.bit_size; unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location]; uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location]; uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location]; @@ -4702,7 +4763,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) /* load channels */ while (channel_start < num_channels) { - unsigned fetch_size = num_channels - channel_start; + unsigned fetch_component = num_channels - channel_start; unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size; bool expanded = false; @@ -4714,15 +4775,17 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) vtx_info->chan_byte_size == 4; unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID; if (!use_mubuf) { - fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_size); + fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_component); } else { - if (fetch_size == 3 && 
ctx->options->chip_class == GFX6) { + if (fetch_component == 3 && ctx->options->chip_class == GFX6) { /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */ - fetch_size = 4; + fetch_component = 4; expanded = true; } } + unsigned fetch_bytes = fetch_component * bitsize / 8; + Temp fetch_index = index; if (attrib_stride != 0 && fetch_offset > attrib_stride) { fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index); @@ -4736,19 +4799,37 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } aco_opcode opcode; - switch (fetch_size) { - case 1: - opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x; - break; + switch (fetch_bytes) { case 2: - opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy; + assert(!use_mubuf && bitsize == 16); + opcode = aco_opcode::tbuffer_load_format_d16_x; + break; + case 4: + if (bitsize == 16) { + assert(!use_mubuf); + opcode = aco_opcode::tbuffer_load_format_d16_xy; + } else { + opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x; + } + break; + case 6: + assert(!use_mubuf && bitsize == 16); + opcode = aco_opcode::tbuffer_load_format_d16_xyz; + break; + case 8: + if (bitsize == 16) { + assert(!use_mubuf); + opcode = aco_opcode::tbuffer_load_format_d16_xyzw; + } else { + opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy; + } break; - case 3: + case 12: assert(ctx->options->chip_class >= GFX7 || (!use_mubuf && ctx->options->chip_class == GFX6)); opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz; break; - case 4: + case 16: opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw; break; default: @@ -4756,19 +4837,19 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) } Temp fetch_dst; - if (channel_start == 0 && fetch_size == dst.size() && !post_shuffle && + if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && !expanded && (alpha_adjust == RADV_ALPHA_ADJUST_NONE || num_channels <= 3)) { direct_fetch = true; fetch_dst = dst; } else { - fetch_dst = bld.tmp(RegType::vgpr, fetch_size); + fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes)); } if (use_mubuf) { Instruction *mubuf = bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index, soffset, - fetch_offset, false, true).instr; + fetch_offset, false, false, true).instr; static_cast(mubuf)->can_reorder = true; } else { Instruction *mtbuf = bld.mtbuf(opcode, @@ -4779,14 +4860,15 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr) emit_split_vector(ctx, fetch_dst, fetch_dst.size()); - if (fetch_size == 1) { + if (fetch_component == 1) { channels[channel_start] = fetch_dst; } else { - for (unsigned i = 0; i < MIN2(fetch_size, num_channels - channel_start); i++) - channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, v1); + for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++) + channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, + bitsize == 16 ? 
v2b : v1); } - channel_start += fetch_size; + channel_start += fetch_component; } if (!direct_fetch) { @@ -5066,7 +5148,7 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) { Builder bld(ctx->program, ctx->block); Temp index = get_ssa_temp(ctx, instr->src[0].ssa); - if (!ctx->divergent_vals[instr->dest.ssa.index]) + if (!nir_dest_is_divergent(instr->dest)) index = bld.as_uniform(index); unsigned desc_set = nir_intrinsic_desc_set(instr); unsigned binding = nir_intrinsic_binding(instr); @@ -5126,11 +5208,11 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr) void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size, Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, - bool glc=false, bool readonly=true) + bool glc=false, bool readonly=true, bool allow_smem=true) { Builder bld(ctx->program, ctx->block); - bool use_smem = dst.type() != RegType::vgpr && ((ctx->options->chip_class >= GFX8 && component_size >= 4) || readonly); + bool use_smem = dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem; if (use_smem) offset = bld.as_uniform(offset); @@ -5213,7 +5295,7 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); if (offset != 0) // TODO check if index != 0 as well - index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index); + index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index); Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants)); Temp vec = dst; bool trim = false; @@ -5255,7 +5337,7 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr) unreachable("unimplemented or forbidden load_push_constant."); } - bld.smem(op, Definition(vec), ptr, index); + static_cast(bld.smem(op, Definition(vec), ptr, index).instr)->prevent_overflow = true; if (!aligned) { Operand byte_offset = index_cv ? 
Operand((offset + index_cv->u32) % 4) : Operand(index); @@ -5299,7 +5381,7 @@ void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr) Temp offset = get_ssa_temp(ctx, instr->src[0].ssa); if (base && offset.type() == RegType::sgpr) - offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base)); + offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base)); else if (base && offset.type() == RegType::vgpr) offset = bld.vadd32(bld.def(v1), Operand(base), offset); @@ -5659,8 +5741,12 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec ctx->block->instructions.emplace_back(std::move(load)); Operand sample_index4; - if (sample_index.isConstant() && sample_index.constantValue() < 16) { - sample_index4 = Operand(sample_index.constantValue() << 2); + if (sample_index.isConstant()) { + if (sample_index.constantValue() < 16) { + sample_index4 = Operand(sample_index.constantValue() << 2); + } else { + sample_index4 = Operand(0u); + } } else if (sample_index.regClass() == s1) { sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u)); } else { @@ -6121,10 +6207,19 @@ void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); - bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); + unsigned access = nir_intrinsic_access(instr); + bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT); unsigned size = instr->dest.ssa.bit_size / 8; + + uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[0].ssa, access); + /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores + * TODO: this optimization is disabled for now because we still need to ensure correct ordering + */ + bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_store : has_vmem_store)); + allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER); + load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), - nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false); + nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem); } void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) @@ -6138,9 +6233,17 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u)); - bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] && + bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr)); + /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores + * TODO: this optimization is disabled for now because we still need to ensure correct ordering + */ + bool allow_smem = !(flags & (0 && glc ? 
has_nonglc_vmem_loadstore : has_vmem_loadstore)); + + bool smem = !nir_src_is_divergent(instr->src[2]) && ctx->options->chip_class >= GFX8 && - elem_size_bytes >= 4; + (elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) && + allow_smem; if (smem) offset = bld.as_uniform(offset); bool smem_nonfs = smem && ctx->stage != fragment_fs; @@ -6160,8 +6263,8 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) aco_ptr store{create_instruction(op, Format::SMEM, 3, 0)}; store->operands[0] = Operand(rsrc); if (offsets[i]) { - Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), - offset, Operand(offsets[i])); + Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), + offset, Operand(offsets[i])); store->operands[1] = Operand(off); } else { store->operands[1] = Operand(offset); @@ -6169,7 +6272,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) if (op != aco_opcode::p_fs_buffer_store_smem) store->operands[1].setFixed(m0); store->operands[2] = Operand(write_datas[i]); - store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + store->glc = glc; store->dlc = false; store->disable_wqm = true; store->barrier = barrier_buffer; @@ -6187,7 +6290,7 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr) store->operands[3] = Operand(write_datas[i]); store->offset = offsets[i]; store->offen = (offset.type() == RegType::vgpr); - store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); + store->glc = glc; store->dlc = false; store->disable_wqm = true; store->barrier = barrier_buffer; @@ -6687,7 +6790,7 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) op32 = aco_opcode::ds_write_b32; op64 = aco_opcode::ds_write_b64; op32_rtn = aco_opcode::ds_wrxchg_rtn_b32; - op64_rtn = aco_opcode::ds_wrxchg2_rtn_b64; + op64_rtn = aco_opcode::ds_wrxchg_rtn_b64; break; case nir_intrinsic_shared_atomic_comp_swap: op32 = aco_opcode::ds_cmpst_b32; @@ -6696,6 +6799,12 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr) op64_rtn = aco_opcode::ds_cmpst_rtn_b64; num_operands = 4; break; + case nir_intrinsic_shared_atomic_fadd: + op32 = aco_opcode::ds_add_f32; + op32_rtn = aco_opcode::ds_add_rtn_f32; + op64 = aco_opcode::num_opcodes; + op64_rtn = aco_opcode::num_opcodes; + break; default: unreachable("Unhandled shared atomic intrinsic"); } @@ -6746,7 +6855,7 @@ Temp get_scratch_resource(isel_context *ctx) scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u)); uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) | - S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);; + S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2); if (ctx->program->chip_class >= GFX10) { rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | @@ -6757,9 +6866,9 @@ Temp get_scratch_resource(isel_context *ctx) S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); } - /* older generations need element size = 16 bytes. element size removed in GFX9 */ + /* older generations need element size = 4 bytes. 
element size removed in GFX9 */ if (ctx->program->chip_class <= GFX8) - rsrc_conf |= S_008F0C_ELEMENT_SIZE(3); + rsrc_conf |= S_008F0C_ELEMENT_SIZE(1); return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf)); } @@ -6774,10 +6883,10 @@ void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { instr->dest.ssa.bit_size / 8u, rsrc}; info.align_mul = nir_intrinsic_align_mul(instr); info.align_offset = nir_intrinsic_align_offset(instr); - info.swizzle_component_size = 16; + info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0; info.can_reorder = false; info.soffset = ctx->program->scratch_offset; - emit_mubuf_load(ctx, bld, &info); + emit_scratch_load(ctx, bld, &info); } void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { @@ -6792,12 +6901,13 @@ void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) { unsigned write_count = 0; Temp write_datas[32]; unsigned offsets[32]; + unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16; split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, - 16, &write_count, write_datas, offsets); + swizzle_component_size, &write_count, write_datas, offsets); for (unsigned i = 0; i < write_count; i++) { aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes()); - bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true); + bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true); } } @@ -7178,6 +7288,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]); Temp private_segment_buffer = ctx->program->private_segment_buffer; + //TODO: bounds checking? if (addr.type() == RegType::sgpr) { Operand offset; if (const_addr) { @@ -7350,6 +7461,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_shared_atomic_xor: case nir_intrinsic_shared_atomic_exchange: case nir_intrinsic_shared_atomic_comp_swap: + case nir_intrinsic_shared_atomic_fadd: visit_shared_atomic(ctx, instr); break; case nir_intrinsic_image_deref_load: @@ -7529,14 +7641,21 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_shuffle: case nir_intrinsic_read_invocation: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - if (!ctx->divergent_vals[instr->src[0].ssa->index]) { + if (!nir_src_is_divergent(instr->src[0])) { emit_uniform_subgroup(ctx, instr, src); } else { Temp tid = get_ssa_temp(ctx, instr->src[1].ssa); - if (instr->intrinsic == nir_intrinsic_read_invocation || !ctx->divergent_vals[instr->src[1].ssa->index]) + if (instr->intrinsic == nir_intrinsic_read_invocation || !nir_src_is_divergent(instr->src[1])) tid = bld.as_uniform(tid); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - if (src.regClass() == v1) { + if (src.regClass() == v1b || src.regClass() == v2b) { + Temp tmp = bld.tmp(v1); + tmp = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), tmp); + if (dst.type() == RegType::vgpr) + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(src.regClass() == v1b ? 
v3b : v2b), tmp); + else + bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); + } else if (src.regClass() == v1) { emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst); } else if (src.regClass() == v2) { Temp lo = bld.tmp(v1), hi = bld.tmp(v1); @@ -7581,7 +7700,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_read_first_invocation: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); - if (src.regClass() == v1) { + if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) { emit_wqm(ctx, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst); @@ -7639,7 +7758,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) nir_intrinsic_cluster_size(instr) : 0; cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size)); - if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) { + if (!nir_src_is_divergent(instr->src[0]) && (op == nir_op_ior || op == nir_op_iand)) { emit_uniform_subgroup(ctx, instr, src); } else if (instr->dest.ssa.bit_size == 1) { if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin) @@ -7666,27 +7785,31 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } else if (cluster_size == 1) { bld.copy(Definition(dst), src); } else { - src = as_vgpr(ctx, src); + unsigned bit_size = instr->src[0].ssa->bit_size; + + src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8)); ReduceOp reduce_op; switch (op) { - #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break; - CASE(iadd) - CASE(imul) - CASE(fadd) - CASE(fmul) - CASE(imin) - CASE(umin) - CASE(fmin) - CASE(imax) - CASE(umax) - CASE(fmax) - CASE(iand) - CASE(ior) - CASE(ixor) + #define CASEI(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; break; + #define CASEF(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? 
name##16 : name##64; break; + CASEI(iadd) + CASEI(imul) + CASEI(imin) + CASEI(umin) + CASEI(imax) + CASEI(umax) + CASEI(iand) + CASEI(ior) + CASEI(ixor) + CASEF(fadd) + CASEF(fmul) + CASEF(fmin) + CASEF(fmax) default: unreachable("unknown reduction op"); - #undef CASE + #undef CASEI + #undef CASEF } aco_opcode aco_op; @@ -7722,7 +7845,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_quad_broadcast: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - if (!ctx->divergent_vals[instr->dest.ssa.index]) { + if (!nir_dest_is_divergent(instr->dest)) { emit_uniform_subgroup(ctx, instr, src); } else { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); @@ -7739,6 +7862,20 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)))); emit_wqm(ctx, tmp, dst); + } else if (instr->dest.ssa.bit_size == 8) { + Temp tmp = bld.tmp(v1); + if (ctx->program->chip_class >= GFX8) + emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); + else + emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp); + } else if (instr->dest.ssa.bit_size == 16) { + Temp tmp = bld.tmp(v1); + if (ctx->program->chip_class >= GFX8) + emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); + else + emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); } else if (instr->dest.ssa.bit_size == 32) { if (ctx->program->chip_class >= GFX8) emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst); @@ -7769,7 +7906,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) case nir_intrinsic_quad_swap_diagonal: case nir_intrinsic_quad_swizzle_amd: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - if (!ctx->divergent_vals[instr->dest.ssa.index]) { + if (!nir_dest_is_divergent(instr->dest)) { emit_uniform_subgroup(ctx, instr, src); break; } @@ -7803,6 +7940,20 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl); Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src); emit_wqm(ctx, tmp, dst); + } else if (instr->dest.ssa.bit_size == 8) { + Temp tmp = bld.tmp(v1); + if (ctx->program->chip_class >= GFX8) + emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); + else + emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp); + } else if (instr->dest.ssa.bit_size == 16) { + Temp tmp = bld.tmp(v1); + if (ctx->program->chip_class >= GFX8) + emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp); + else + emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp); + bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp); } else if (instr->dest.ssa.bit_size == 32) { Temp tmp; if (ctx->program->chip_class >= GFX8) @@ -7831,21 +7982,31 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) } case nir_intrinsic_masked_swizzle_amd: { Temp src = get_ssa_temp(ctx, instr->src[0].ssa); - 
if (!ctx->divergent_vals[instr->dest.ssa.index]) { + if (!nir_dest_is_divergent(instr->dest)) { emit_uniform_subgroup(ctx, instr, src); break; } Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); uint32_t mask = nir_intrinsic_swizzle_mask(instr); - if (dst.regClass() == v1) { - emit_wqm(ctx, - bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false), - dst); + if (instr->dest.ssa.bit_size == 1) { + assert(src.regClass() == bld.lm); + src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src); + src = emit_masked_swizzle(ctx, bld, src, mask); + Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src); + emit_wqm(ctx, tmp, dst); + } else if (dst.regClass() == v1b) { + Temp tmp = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask)); + emit_extract_vector(ctx, tmp, 0, dst); + } else if (dst.regClass() == v2b) { + Temp tmp = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask)); + emit_extract_vector(ctx, tmp, 0, dst); + } else if (dst.regClass() == v1) { + emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask), dst); } else if (dst.regClass() == v2) { Temp lo = bld.tmp(v1), hi = bld.tmp(v1); bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); - lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false)); - hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false)); + lo = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, lo, mask)); + hi = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, hi, mask)); bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); emit_split_vector(ctx, dst, 2); } else { @@ -7928,10 +8089,14 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr) get_ssa_temp(ctx, &instr->dest.ssa)); break; } - case nir_intrinsic_shader_clock: - bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false); + case nir_intrinsic_shader_clock: { + aco_opcode opcode = + nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ? 
+ aco_opcode::s_memrealtime : aco_opcode::s_memtime; + bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false); emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2); break; + } case nir_intrinsic_load_vertex_id_zero_base: { Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id)); @@ -8223,9 +8388,11 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) { Builder bld(ctx->program, ctx->block); bool has_bias = false, has_lod = false, level_zero = false, has_compare = false, - has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false; + has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false, + has_clamped_lod = false; Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(), - lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(); + lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(), + clamped_lod = Temp(); std::vector coords; std::vector derivs; nir_const_value *sample_index_cv = NULL; @@ -8247,10 +8414,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) break; } case nir_tex_src_bias: - if (instr->op == nir_texop_txb) { - bias = get_ssa_temp(ctx, instr->src[i].src.ssa); - has_bias = true; - } + bias = get_ssa_temp(ctx, instr->src[i].src.ssa); + has_bias = true; break; case nir_tex_src_lod: { nir_const_value *val = nir_src_as_const_value(instr->src[i].src); @@ -8263,6 +8428,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } break; } + case nir_tex_src_min_lod: + clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa); + has_clamped_lod = true; + break; case nir_tex_src_comparator: if (instr->is_shadow) { compare = get_ssa_temp(ctx, instr->src[i].src.ssa); @@ -8303,10 +8472,20 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16)); Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2); Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */)); - Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u)); + Operand default_sample = Operand(1u); + if (ctx->options->robust_buffer_access) { + /* Extract the second dword of the descriptor, if it's + * all zero, then it's a null descriptor. + */ + Temp dword1 = emit_extract_vector(ctx, resource, 1, s1); + Temp is_non_null_descriptor = bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u)); + default_sample = Operand(is_non_null_descriptor); + } + + Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u)); bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), - samples, Operand(1u), bld.scc(is_msaa)); + samples, default_sample, bld.scc(is_msaa)); return; } @@ -8653,6 +8832,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) args.emplace_back(sample_index); if (has_lod) args.emplace_back(lod); + if (has_clamped_lod) + args.emplace_back(clamped_lod); Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size())); aco_ptr vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)}; @@ -8697,7 +8878,21 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) // TODO: would be better to do this by adding offsets, but needs the opcodes ordered. 
aco_opcode opcode = aco_opcode::image_sample; if (has_offset) { /* image_sample_*_o */ - if (has_compare) { + if (has_clamped_lod) { + if (has_compare) { + opcode = aco_opcode::image_sample_c_cl_o; + if (has_derivs) + opcode = aco_opcode::image_sample_c_d_cl_o; + if (has_bias) + opcode = aco_opcode::image_sample_c_b_cl_o; + } else { + opcode = aco_opcode::image_sample_cl_o; + if (has_derivs) + opcode = aco_opcode::image_sample_d_cl_o; + if (has_bias) + opcode = aco_opcode::image_sample_b_cl_o; + } + } else if (has_compare) { opcode = aco_opcode::image_sample_c_o; if (has_derivs) opcode = aco_opcode::image_sample_c_d_o; @@ -8718,6 +8913,20 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) if (has_lod) opcode = aco_opcode::image_sample_l_o; } + } else if (has_clamped_lod) { /* image_sample_*_cl */ + if (has_compare) { + opcode = aco_opcode::image_sample_c_cl; + if (has_derivs) + opcode = aco_opcode::image_sample_c_d_cl; + if (has_bias) + opcode = aco_opcode::image_sample_c_b_cl; + } else { + opcode = aco_opcode::image_sample_cl; + if (has_derivs) + opcode = aco_opcode::image_sample_d_cl; + if (has_bias) + opcode = aco_opcode::image_sample_b_cl; + } } else { /* no offset */ if (has_compare) { opcode = aco_opcode::image_sample_c; @@ -8743,14 +8952,34 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } if (instr->op == nir_texop_tg4) { - if (has_offset) { - opcode = aco_opcode::image_gather4_lz_o; - if (has_compare) + if (has_offset) { /* image_gather4_*_o */ + if (has_compare) { opcode = aco_opcode::image_gather4_c_lz_o; + if (has_lod) + opcode = aco_opcode::image_gather4_c_l_o; + if (has_bias) + opcode = aco_opcode::image_gather4_c_b_o; + } else { + opcode = aco_opcode::image_gather4_lz_o; + if (has_lod) + opcode = aco_opcode::image_gather4_l_o; + if (has_bias) + opcode = aco_opcode::image_gather4_b_o; + } } else { - opcode = aco_opcode::image_gather4_lz; - if (has_compare) + if (has_compare) { opcode = aco_opcode::image_gather4_c_lz; + if (has_lod) + opcode = aco_opcode::image_gather4_c_l; + if (has_bias) + opcode = aco_opcode::image_gather4_c_b; + } else { + opcode = aco_opcode::image_gather4_lz; + if (has_lod) + opcode = aco_opcode::image_gather4_l; + if (has_bias) + opcode = aco_opcode::image_gather4_b; + } } } else if (instr->op == nir_texop_lod) { opcode = aco_opcode::image_get_lod; @@ -8801,13 +9030,19 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr) } -Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa) +Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc, bool logical) { Temp tmp = get_ssa_temp(ctx, ssa); - if (ssa->parent_instr->type == nir_instr_type_ssa_undef) - return Operand(tmp.regClass()); - else + if (ssa->parent_instr->type == nir_instr_type_ssa_undef) { + return Operand(rc); + } else if (logical && ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) { + if (ctx->program->wave_size == 64) + return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX : 0u); + else + return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? 
UINT32_MAX : 0u); + } else { return Operand(tmp); + } } void visit_phi(isel_context *ctx, nir_phi_instr *instr) @@ -8816,7 +9051,7 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask); - bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index]; + bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest); logical |= ctx->block->kind & block_kind_merge; aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi; @@ -8850,7 +9085,7 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr) if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size()) continue; cur_pred_idx++; - Operand op = get_phi_operand(ctx, src.second); + Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical); operands[num_operands++] = op; num_defined += !op.isUndefined(); } @@ -9510,7 +9745,7 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt) aco_ptr branch; if_context ic; - if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */ + if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */ /** * Uniform conditionals are represented in the following way*) : * @@ -9538,8 +9773,6 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt) visit_cf_list(ctx, &if_stmt->else_list); end_uniform_if(ctx, &ic); - - return !ctx->cf_info.has_branch; } else { /* non-uniform condition */ /** * To maintain a logical and linear CFG without critical edges, @@ -9573,9 +9806,9 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt) visit_cf_list(ctx, &if_stmt->else_list); end_divergent_if(ctx, &ic); - - return true; } + + return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty(); } static bool visit_cf_list(isel_context *ctx, @@ -9856,6 +10089,7 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1; bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1; + bool is_16bit = values[0].regClass() == v2b; switch (col_format) { @@ -9886,16 +10120,38 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) case V_028714_SPI_SHADER_FP16_ABGR: enabled_channels = 0x5; compr_op = aco_opcode::v_cvt_pkrtz_f16_f32; + if (is_16bit) { + if (ctx->options->chip_class >= GFX9) { + /* Pack the FP16 values together instead of converting them to + * FP32 and back to FP16. + * TODO: use p_create_vector and let the compiler optimizes. 
+ */ + compr_op = aco_opcode::v_pack_b32_f16; + } else { + for (unsigned i = 0; i < 4; i++) { + if ((write_mask >> i) & 1) + values[i] = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), values[i]); + } + } + } break; case V_028714_SPI_SHADER_UNORM16_ABGR: enabled_channels = 0x5; - compr_op = aco_opcode::v_cvt_pknorm_u16_f32; + if (is_16bit && ctx->options->chip_class >= GFX9) { + compr_op = aco_opcode::v_cvt_pknorm_u16_f16; + } else { + compr_op = aco_opcode::v_cvt_pknorm_u16_f32; + } break; case V_028714_SPI_SHADER_SNORM16_ABGR: enabled_channels = 0x5; - compr_op = aco_opcode::v_cvt_pknorm_i16_f32; + if (is_16bit && ctx->options->chip_class >= GFX9) { + compr_op = aco_opcode::v_cvt_pknorm_i16_f16; + } else { + compr_op = aco_opcode::v_cvt_pknorm_i16_f32; + } break; case V_028714_SPI_SHADER_UINT16_ABGR: { @@ -9913,6 +10169,13 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) values[i]); } } + } else if (is_16bit) { + for (unsigned i = 0; i < 4; i++) { + if ((write_mask >> i) & 1) { + Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false); + values[i] = Operand(tmp); + } + } } break; } @@ -9937,6 +10200,13 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) values[i]); } } + } else if (is_16bit) { + for (unsigned i = 0; i < 4; i++) { + if ((write_mask >> i) & 1) { + Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true); + values[i] = Operand(tmp); + } + } } break; @@ -9951,6 +10221,26 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot) if (target == V_008DFC_SQ_EXP_NULL) return false; + /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */ + if (ctx->options->enable_mrt_output_nan_fixup && + !is_16bit && + (col_format == V_028714_SPI_SHADER_32_R || + col_format == V_028714_SPI_SHADER_32_GR || + col_format == V_028714_SPI_SHADER_32_AR || + col_format == V_028714_SPI_SHADER_32_ABGR || + col_format == V_028714_SPI_SHADER_FP16_ABGR)) { + for (int i = 0; i < 4; i++) { + if (!(write_mask & (1 << i))) + continue; + + Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, + bld.hint_vcc(bld.def(bld.lm)), values[i], + bld.copy(bld.def(v1), Operand(3u))); + values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i], + bld.copy(bld.def(v1), Operand(0u)), isnan); + } + } + if ((bool) compr_op) { for (int i = 0; i < 2; i++) { /* check if at least one of the values to be compressed is enabled */ @@ -10071,8 +10361,8 @@ static void write_tcs_tess_factors(isel_context *ctx) Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u)); bld.mubuf(aco_opcode::buffer_store_dword, /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word, - /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false, - /* disable_wqm */ false, /* glc */ true); + /* immediate OFFSET */ 0, /* OFFEN */ false, /* swizzled */ false, /* idxen*/ false, + /* addr64 */ false, /* disable_wqm */ false, /* glc */ true); tf_const_offset += 4; begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero); @@ -10362,7 +10652,8 @@ void setup_fp_mode(isel_context *ctx, nir_shader *shader) float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64); - /* default to preserving fp16 and fp64 denorms, since it's free */ + /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and + * the precision seems needed for Wolfenstein: Youngblood to render correctly */ 
if (program->next_fp_mode.must_flush_denorms16_64) program->next_fp_mode.denorm16_64 = 0; else @@ -10727,8 +11018,6 @@ void select_program(Program *program, if (ngg_no_gs && !ngg_early_prim_export(&ctx)) ngg_emit_nogs_output(&ctx); - ralloc_free(ctx.divergent_vals); - if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) { /* Outputs of the previous stage are inputs to the next stage */ ctx.inputs = ctx.outputs; @@ -10754,16 +11043,6 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader, { isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true); - program->next_fp_mode.preserve_signed_zero_inf_nan32 = false; - program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false; - program->next_fp_mode.must_flush_denorms32 = false; - program->next_fp_mode.must_flush_denorms16_64 = false; - program->next_fp_mode.care_about_round32 = false; - program->next_fp_mode.care_about_round16_64 = false; - program->next_fp_mode.denorm16_64 = fp_denorm_keep; - program->next_fp_mode.denorm32 = 0; - program->next_fp_mode.round32 = fp_round_ne; - program->next_fp_mode.round16_64 = fp_round_ne; ctx.block->fp_mode = program->next_fp_mode; add_startpgm(&ctx);
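
The mubuf_load_callback() hunk above now takes the known alignment into account, not just bytes_needed, when choosing the load width. Below is a rough standalone C++ sketch of that selection; pick_buffer_load() is an invented helper, the returned strings mirror the opcode names in the hunk, and the >= 8-byte cases are assumed to follow the pre-existing dwordx2/x3/x4 ladder that the patch leaves untouched.

#include <cstdio>

/* Sketch: choose a buffer load width from the bytes still needed and the
 * known alignment of the address; unaligned addresses force narrower loads. */
const char *pick_buffer_load(unsigned bytes_needed, unsigned align)
{
    if (bytes_needed == 1 || align % 2)
        return "buffer_load_ubyte";   /* 1 byte  */
    if (bytes_needed == 2 || align % 4)
        return "buffer_load_ushort";  /* 2 bytes */
    if (bytes_needed <= 4)
        return "buffer_load_dword";   /* 4 bytes */
    if (bytes_needed <= 8)
        return "buffer_load_dwordx2"; /* assumed: outside the hunk shown */
    if (bytes_needed <= 12)
        return "buffer_load_dwordx3"; /* assumed: outside the hunk shown */
    return "buffer_load_dwordx4";     /* assumed: outside the hunk shown */
}

int main()
{
    printf("%s\n", pick_buffer_load(4, 2)); /* only 2-byte aligned -> buffer_load_ushort */
    printf("%s\n", pick_buffer_load(4, 4)); /* dword aligned       -> buffer_load_dword  */
    return 0;
}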
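
The split_buffer_store() hunk above now derives alignment from nir_intrinsic_align_offset() plus the running byte offset and caps unaligned stores to 2 or 1 bytes instead of always 2. A minimal sketch of that capping rule, with clamp_store_bytes() as an invented stand-in for the in-tree logic:

#include <algorithm>
#include <cstdio>

/* Sketch: dword or larger stores have to be dword-aligned; otherwise fall
 * back to a 2-byte store when 2-byte alignment is provable, else 1 byte.
 * align_offset here already includes the offset of the current split. */
unsigned clamp_store_bytes(unsigned align_mul, unsigned align_offset, unsigned bytes)
{
    bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
    if (!dword_aligned)
        bytes = std::min(bytes, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2u : 1u);
    return bytes;
}

int main()
{
    printf("%u\n", clamp_store_bytes(4, 2, 4)); /* 2-byte aligned: capped to 2 */
    printf("%u\n", clamp_store_bytes(1, 0, 4)); /* unknown alignment: capped to 1 */
    return 0;
}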
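
The visit_load_input() hunk above switches on fetch_bytes rather than the component count so that 16-bit vertex attributes can use the d16 typed-buffer loads. A compact sketch of that mapping; select_fetch_opcode() is an invented wrapper and the strings mirror the opcodes used in the hunk.

#include <cassert>
#include <cstdio>

/* Sketch: map the byte size of one vertex fetch to a load opcode.  16-bit
 * attributes use the d16 tbuffer formats, 32-bit attributes the plain forms
 * (MUBUF dword loads when the data format allows it). */
const char *select_fetch_opcode(unsigned fetch_component, unsigned bitsize, bool use_mubuf)
{
    unsigned fetch_bytes = fetch_component * bitsize / 8;
    switch (fetch_bytes) {
    case 2:  assert(!use_mubuf && bitsize == 16); return "tbuffer_load_format_d16_x";
    case 4:  return bitsize == 16 ? "tbuffer_load_format_d16_xy"
                                  : use_mubuf ? "buffer_load_dword" : "tbuffer_load_format_x";
    case 6:  assert(!use_mubuf && bitsize == 16); return "tbuffer_load_format_d16_xyz";
    case 8:  return bitsize == 16 ? "tbuffer_load_format_d16_xyzw"
                                  : use_mubuf ? "buffer_load_dwordx2" : "tbuffer_load_format_xy";
    case 12: return use_mubuf ? "buffer_load_dwordx3" : "tbuffer_load_format_xyz";
    case 16: return use_mubuf ? "buffer_load_dwordx4" : "tbuffer_load_format_xyzw";
    default: assert(false); return "";
    }
}

int main()
{
    printf("%s\n", select_fetch_opcode(3, 16, false)); /* 6 bytes  -> tbuffer_load_format_d16_xyz */
    printf("%s\n", select_fetch_opcode(4, 32, true));  /* 16 bytes -> buffer_load_dwordx4 */
    return 0;
}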
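
The CASEI/CASEF macros in the subgroup-reduce hunk above pick the ReduceOp variant from the source bit size, with 8-bit variants existing only for the integer ops. A tiny sketch of that selection; reduce_op_name() is invented, and the real code resolves enum values rather than strings.

#include <cstdio>
#include <string>

/* Sketch of the suffix selection done by CASEI/CASEF: 32- and 16-bit sizes
 * map directly, 8-bit only exists for integer ops, everything else falls
 * back to the 64-bit variant. */
std::string reduce_op_name(const char *op, unsigned bit_size, bool is_float)
{
    unsigned size = bit_size == 32 ? 32 :
                    bit_size == 16 ? 16 :
                    (!is_float && bit_size == 8) ? 8 : 64;
    return std::string(op) + std::to_string(size);
}

int main()
{
    printf("%s\n", reduce_op_name("iadd", 8, false).c_str());  /* iadd8  */
    printf("%s\n", reduce_op_name("fmax", 16, true).c_str());  /* fmax16 */
    return 0;
}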
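
get_phi_operand() above now materialises constant booleans that feed logical phis as whole-wave lane-mask immediates instead of plain temporaries. A one-function sketch of that mapping (bool_to_lane_mask() is an invented name):

#include <cstdint>
#include <cstdio>

/* Sketch: a constant boolean used by a logical phi becomes a lane mask that
 * is either all ones (every lane true) or zero, sized to the wave. */
uint64_t bool_to_lane_mask(bool value, unsigned wave_size)
{
    if (!value)
        return 0;
    return wave_size == 64 ? UINT64_MAX : UINT32_MAX;
}

int main()
{
    printf("0x%llx\n", (unsigned long long)bool_to_lane_mask(true, 64)); /* 0xffffffffffffffff */
    printf("0x%llx\n", (unsigned long long)bool_to_lane_mask(true, 32)); /* 0xffffffff */
    return 0;
}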
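
The export_fs_mrt_color() hunk above adds an opt-in fix-up that zeroes NaN outputs before export, using v_cmp_class_f32 with class mask 3 followed by v_cndmask_b32 (mask 3 should select both the signaling and quiet NaN classes). Per lane this is equivalent to the scalar sketch below.

#include <cmath>
#include <cstdio>

/* Per-lane equivalent of the MRT NaN fix-up: lanes whose value is any kind
 * of NaN are replaced by 0.0 before the export. */
float mrt_nan_fixup(float v)
{
    return std::isnan(v) ? 0.0f : v;
}

int main()
{
    printf("%f\n", mrt_nan_fixup(1.5f));          /* 1.500000 */
    printf("%f\n", mrt_nan_fixup(std::nanf(""))); /* 0.000000 */
    return 0;
}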