aco: add a helper for building a trap handler shader

[mesa.git] / src / amd / compiler / aco_instruction_selection.cpp
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp

index de28601744b72f5c8cb085b5ff61353fe89e221b..6f1f8b4e07e701d59d173dd70c4a87f0eeb497bb 100644 (file)
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -38,6 +38,23 @@
  namespace aco {
  namespace {
  
+#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
+
+static void _isel_err(isel_context *ctx, const char *file, unsigned line,
+                      const nir_instr *instr, const char *msg)
+{
+   char *out;
+   size_t outsize;
+   FILE *memf = open_memstream(&out, &outsize);
+
+   fprintf(memf, "%s: ", msg);
+   nir_print_instr(instr, memf);
+   fclose(memf);
+
+   _aco_err(ctx->program, file, line, out);
+   free(out);
+}
+
  class loop_info_RAII {
     isel_context* ctx;
     unsigned header_idx_old;
@@ -136,8 +153,11 @@ Temp emit_mbcnt(isel_context *ctx, Definition dst,
  
     if (ctx->program->wave_size == 32) {
        return thread_id_lo;
+   } else if (ctx->program->chip_class <= GFX7) {
+      Temp thread_id_hi = bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
+      return thread_id_hi;
     } else {
-      Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
+      Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, dst, mask_hi, thread_id_lo);
        return thread_id_hi;
     }
  }
@@ -169,33 +189,69 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data
     if (index.regClass() == s1)
        return bld.readlane(bld.def(s1), data, index);
  
-   Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
-
-   /* Currently not implemented on GFX6-7 */
-   assert(ctx->options->chip_class >= GFX8);
-
-   if (ctx->options->chip_class <= GFX9 || ctx->program->wave_size == 32) {
+   if (ctx->options->chip_class <= GFX7) {
+      /* GFX6-7: there is no bpermute instruction */
+      Operand index_op(index);
+      Operand input_data(data);
+      index_op.setLateKill(true);
+      input_data.setLateKill(true);
+
+      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), index_op, input_data);
+   } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) {
+      /* GFX10 wave64 mode: emulate full-wave bpermute */
+      if (!ctx->has_gfx10_wave64_bpermute) {
+         ctx->has_gfx10_wave64_bpermute = true;
+         ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
+         ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
+      }
+
+      Temp index_is_lo = bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand(31u), index);
+      Builder::Result index_is_lo_split = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo);
+      Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), index_is_lo_split.def(1).getTemp());
+      Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), index_is_lo_split.def(0).getTemp(), index_is_lo_n1);
+      Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
+      Operand input_data(data);
+
+      index_x4.setLateKill(true);
+      input_data.setLateKill(true);
+      same_half.setLateKill(true);
+
+      return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), index_x4, input_data, same_half);
+   } else {
+      /* GFX8-9 or GFX10 wave32: bpermute works normally */
+      Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(2u), index);
        return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data);
     }
+}
  
-   /* GFX10, wave64 mode:
-    * The bpermute instruction is limited to half-wave operation, which means that it can't
-    * properly support subgroup shuffle like older generations (or wave32 mode), so we
-    * emulate it here.
-    */
-   if (!ctx->has_gfx10_wave64_bpermute) {
-      ctx->has_gfx10_wave64_bpermute = true;
-      ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */
-      ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */
-   }
+static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask)
+{
+   if (ctx->options->chip_class >= GFX8) {
+      unsigned and_mask = mask & 0x1f;
+      unsigned or_mask = (mask >> 5) & 0x1f;
+      unsigned xor_mask = (mask >> 10) & 0x1f;
  
-   Temp lane_id = emit_mbcnt(ctx, bld.def(v1));
-   Temp lane_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), lane_id);
-   Temp index_is_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x20u), index);
-   Temp cmp = bld.vopc(aco_opcode::v_cmp_eq_u32, bld.def(bld.lm, vcc), lane_is_hi, index_is_hi);
+      uint16_t dpp_ctrl = 0xffff;
+
+      // TODO: we could use DPP8 for some swizzles
+      if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
+         unsigned res[4] = {0, 1, 2, 3};
+         for (unsigned i = 0; i < 4; i++)
+            res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
+         dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
+      } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
+         dpp_ctrl = dpp_row_rr(8);
+      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
+         dpp_ctrl = dpp_row_mirror;
+      } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
+         dpp_ctrl = dpp_row_half_mirror;
+      }
  
-   return bld.reduction(aco_opcode::p_wave64_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc),
-                        bld.vcc(cmp), Operand(v2.as_linear()), index_x4, data, gfx10_wave64_bpermute);
+      if (dpp_ctrl != 0xffff)
+         return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
+   }
+
+   return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
  }
  
  Temp as_vgpr(isel_context *ctx, Temp val)
@@ -304,20 +360,21 @@ void emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components)
        return;
     if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end())
        return;
-   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
-   split->operands[0] = Operand(vec_src);
-   std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
     RegClass rc;
     if (num_components > vec_src.size()) {
-      if (vec_src.type() == RegType::sgpr)
+      if (vec_src.type() == RegType::sgpr) {
+         /* should still help get_alu_src() */
+         emit_split_vector(ctx, vec_src, vec_src.size());
           return;
-
+      }
        /* sub-dword split */
-      assert(vec_src.type() == RegType::vgpr);
        rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword();
     } else {
        rc = RegClass(vec_src.type(), vec_src.size() / num_components);
     }
+   aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)};
+   split->operands[0] = Operand(vec_src);
+   std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
     for (unsigned i = 0; i < num_components; i++) {
        elems[i] = {ctx->program->allocateId(), rc};
        split->definitions[i] = Definition(elems[i]);
@@ -395,7 +452,7 @@ void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
        bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec);
        hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand(0u));
        if (select != Temp())
-         hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), select);
+         hi = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand(0u), bld.scc(select));
        lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift);
        Temp mid = bld.tmp(s1);
        lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo);
@@ -406,38 +463,67 @@ void byte_align_scalar(isel_context *ctx, Temp vec, Operand offset, Temp dst)
     }
  }
  
-/* this function trims subdword vectors:
- * if dst is vgpr - split the src and create a shrunk version according to the mask.
- * if dst is sgpr - split the src, but move the original to sgpr. */
-void trim_subdword_vector(isel_context *ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask)
+void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst, unsigned component_size)
  {
-   assert(vec_src.type() == RegType::vgpr);
-   emit_split_vector(ctx, vec_src, num_components);
-
     Builder bld(ctx->program, ctx->block);
-   std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
-   unsigned component_size = vec_src.bytes() / num_components;
-   RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
+   if (offset.isTemp()) {
+      Temp tmp[4] = {vec, vec, vec, vec};
  
-   unsigned k = 0;
-   for (unsigned i = 0; i < num_components; i++) {
-      if (mask & (1 << i))
-         elems[k++] = emit_extract_vector(ctx, vec_src, i, rc);
+      if (vec.size() == 4) {
+         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), Definition(tmp[3]), vec);
+      } else if (vec.size() == 3) {
+         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
+      } else if (vec.size() == 2) {
+         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
+         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
+      }
+      for (unsigned i = 0; i < dst.size(); i++)
+         tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
+
+      vec = tmp[0];
+      if (dst.size() == 2)
+         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
+
+      offset = Operand(0u);
+   }
+
+   unsigned num_components = vec.bytes() / component_size;
+   if (vec.regClass() == dst.regClass()) {
+      assert(offset.constantValue() == 0);
+      bld.copy(Definition(dst), vec);
+      emit_split_vector(ctx, dst, num_components);
+      return;
     }
  
+   emit_split_vector(ctx, vec, num_components);
+   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
+   RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
+
+   assert(offset.constantValue() % component_size == 0);
+   unsigned skip = offset.constantValue() / component_size;
+   for (unsigned i = skip; i < num_components; i++)
+      elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
+
+   /* if dst is vgpr - split the src and create a shrunk version according to the mask. */
     if (dst.type() == RegType::vgpr) {
-      assert(dst.bytes() == k * component_size);
-      aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, k, 1)};
-      for (unsigned i = 0; i < k; i++)
-         vec->operands[i] = Operand(elems[i]);
-      vec->definitions[0] = Definition(dst);
-      bld.insert(std::move(vec));
+      num_components = dst.bytes() / component_size;
+      aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
+      for (unsigned i = 0; i < num_components; i++)
+         create_vec->operands[i] = Operand(elems[i]);
+      create_vec->definitions[0] = Definition(dst);
+      bld.insert(std::move(create_vec));
+
+   /* if dst is sgpr - split the src, but move the original to sgpr. */
+   } else if (skip) {
+      vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec);
+      byte_align_scalar(ctx, vec, offset, dst);
     } else {
-      // TODO: alignbyte if mask doesn't start with 1?
-      assert(mask & 1);
-      assert(dst.size() == vec_src.size());
-      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src);
+      assert(dst.size() == vec.size());
+      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
     }
+
     ctx->allocated_vec.emplace(dst.id(), elems);
  }
  
@@ -468,6 +554,104 @@ Temp bool_to_scalar_condition(isel_context *ctx, Temp val, Temp dst = Temp(0, s1
     return emit_wqm(ctx, tmp, dst);
  }
  
+Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp())
+{
+   if (!dst.id()) {
+      if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
+         dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
+      else
+         dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
+   }
+
+   if (dst.bytes() == src.bytes() && dst_bits < src_bits)
+      return bld.copy(Definition(dst), src);
+   else if (dst.bytes() < src.bytes())
+      return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
+
+   Temp tmp = dst;
+   if (dst_bits == 64)
+      tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
+
+   if (tmp == src) {
+   } else if (src.regClass() == s1) {
+      if (is_signed)
+         bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src);
+      else
+         bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src);
+   } else if (ctx->options->chip_class >= GFX8) {
+      assert(src_bits != 8 || src.regClass() == v1b);
+      assert(src_bits != 16 || src.regClass() == v2b);
+      aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
+      sdwa->operands[0] = Operand(src);
+      sdwa->definitions[0] = Definition(tmp);
+      if (is_signed)
+         sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
+      else
+         sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
+      sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
+      bld.insert(std::move(sdwa));
+   } else {
+      assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
+      aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
+      bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u));
+   }
+
+   if (dst_bits == 64) {
+      if (is_signed && dst.regClass() == s2) {
+         Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u));
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
+      } else if (is_signed && dst.regClass() == v2) {
+         Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
+      } else {
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
+      }
+   }
+
+   return dst;
+}
+
+enum sgpr_extract_mode {
+   sgpr_extract_sext,
+   sgpr_extract_zext,
+   sgpr_extract_undef,
+};
+
+Temp extract_8_16_bit_sgpr_element(isel_context *ctx, Temp dst, nir_alu_src *src, sgpr_extract_mode mode)
+{
+   Temp vec = get_ssa_temp(ctx, src->src.ssa);
+   unsigned src_size = src->src.ssa->bit_size;
+   unsigned swizzle = src->swizzle[0];
+
+   if (vec.size() > 1) {
+      assert(src_size == 16);
+      vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
+      swizzle = swizzle & 1;
+   }
+
+   Builder bld(ctx->program, ctx->block);
+   unsigned offset = src_size * swizzle;
+   Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;
+
+   if (mode == sgpr_extract_undef && swizzle == 0) {
+      bld.copy(Definition(tmp), vec);
+   } else if (mode == sgpr_extract_undef || (offset == 24 && mode == sgpr_extract_zext)) {
+      bld.sop2(aco_opcode::s_lshr_b32, Definition(tmp), bld.def(s1, scc), vec, Operand(offset));
+   } else if (src_size == 8 && swizzle == 0 && mode == sgpr_extract_sext) {
+      bld.sop1(aco_opcode::s_sext_i32_i8, Definition(tmp), vec);
+   } else if (src_size == 16 && swizzle == 0 && mode == sgpr_extract_sext) {
+      bld.sop1(aco_opcode::s_sext_i32_i16, Definition(tmp), vec);
+   } else {
+      aco_opcode op = mode == sgpr_extract_zext ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
+      bld.sop2(op, Definition(tmp), bld.def(s1, scc), vec, Operand((src_size << 16) | offset));
+   }
+
+   if (dst.regClass() == s2)
+      convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);
+
+   return dst;
+}
+
  Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
  {
     if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
@@ -491,22 +675,8 @@ Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
     if (elem_size < 4 && vec.type() == RegType::sgpr) {
        assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
        assert(size == 1);
-      unsigned swizzle = src.swizzle[0];
-      if (vec.size() > 1) {
-         assert(src.src.ssa->bit_size == 16);
-         vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
-         swizzle = swizzle & 1;
-      }
-      if (swizzle == 0)
-         return vec;
-
-      Temp dst{ctx->program->allocateId(), s1};
-      aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 1)};
-      bfe->operands[0] = Operand(vec);
-      bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle)));
-      bfe->definitions[0] = Definition(dst);
-      ctx->block->instructions.emplace_back(std::move(bfe));
-      return dst;
+      return extract_8_16_bit_sgpr_element(
+         ctx, Temp(ctx->program->allocateId(), s1), &src, sgpr_extract_undef);
     }
  
     RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4);
@@ -545,6 +715,8 @@ void emit_sop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o
     sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
     sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
     sop2->definitions[0] = Definition(dst);
+   if (instr->no_unsigned_wrap)
+      sop2->definitions[0].setNUW(true);
     if (writes_scc)
        sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
     ctx->block->instructions.emplace_back(std::move(sop2));
@@ -554,6 +726,8 @@ void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o
                             bool commutative, bool swap_srcs=false, bool flush_denorms = false)
  {
     Builder bld(ctx->program, ctx->block);
+   bld.is_precise = instr->exact;
+
     Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]);
     Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]);
     if (src1.type() == RegType::sgpr) {
@@ -575,6 +749,31 @@ void emit_vop2_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode o
     }
  }
  
+void emit_vop2_instruction_logic64(isel_context *ctx, nir_alu_instr *instr,
+                                   aco_opcode op, Temp dst)
+{
+   Builder bld(ctx->program, ctx->block);
+   bld.is_precise = instr->exact;
+
+   Temp src0 = get_alu_src(ctx, instr->src[0]);
+   Temp src1 = get_alu_src(ctx, instr->src[1]);
+
+   if (src1.type() == RegType::sgpr) {
+      assert(src0.type() == RegType::vgpr);
+      std::swap(src0, src1);
+   }
+
+   Temp src00 = bld.tmp(src0.type(), 1);
+   Temp src01 = bld.tmp(src0.type(), 1);
+   bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0);
+   Temp src10 = bld.tmp(v1);
+   Temp src11 = bld.tmp(v1);
+   bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1);
+   Temp lo = bld.vop2(op, bld.def(v1), src00, src10);
+   Temp hi = bld.vop2(op, bld.def(v1), src01, src11);
+   bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
+}
+
  void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst,
                              bool flush_denorms = false)
  {
@@ -592,6 +791,7 @@ void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode
        src2 = as_vgpr(ctx, src2);
  
     Builder bld(ctx->program, ctx->block);
+   bld.is_precise = instr->exact;
     if (flush_denorms && ctx->program->chip_class < GFX9) {
        assert(dst.size() == 1);
        Temp tmp = bld.vop3(op, Definition(dst), src0, src1, src2);
@@ -604,7 +804,12 @@ void emit_vop3a_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode
  void emit_vop1_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
  {
     Builder bld(ctx->program, ctx->block);
-   bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
+   bld.is_precise = instr->exact;
+   if (dst.type() == RegType::sgpr)
+      bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
+                 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0])));
+   else
+      bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0]));
  }
  
  void emit_vopc_instruction(isel_context *ctx, nir_alu_instr *instr, aco_opcode op, Temp dst)
@@ -709,9 +914,8 @@ void emit_comparison(isel_context *ctx, nir_alu_instr *instr, Temp dst,
  {
     aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op : instr->src[0].src.ssa->bit_size == 32 ? s32_op : aco_opcode::num_opcodes;
     aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op : instr->src[0].src.ssa->bit_size == 32 ? v32_op : v16_op;
-   bool divergent_vals = ctx->divergent_vals[instr->dest.dest.ssa.index];
     bool use_valu = s_op == aco_opcode::num_opcodes ||
-                   divergent_vals ||
+                   nir_dest_is_divergent(instr->dest.dest) ||
                     ctx->allocated[instr->src[0].src.ssa->index].type() == RegType::vgpr ||
                     ctx->allocated[instr->src[1].src.ssa->index].type() == RegType::vgpr;
     aco_opcode op = use_valu ? v_op : s_op;
@@ -748,18 +952,12 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
  
     if (dst.type() == RegType::vgpr) {
        aco_ptr<Instruction> bcsel;
-      if (dst.regClass() == v2b) {
-         then = as_vgpr(ctx, then);
-         els = as_vgpr(ctx, els);
-
-         Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), els, then, cond);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
-      } else if (dst.regClass() == v1) {
+      if (dst.size() == 1) {
           then = as_vgpr(ctx, then);
           els = as_vgpr(ctx, els);
  
           bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond);
-      } else if (dst.regClass() == v2) {
+      } else if (dst.size() == 2) {
           Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1);
           bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then);
           Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1);
@@ -770,9 +968,7 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
  
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        return;
     }
@@ -783,16 +979,14 @@ void emit_bcsel(isel_context *ctx, nir_alu_instr *instr, Temp dst)
        assert(els.regClass() == bld.lm);
     }
  
-   if (!ctx->divergent_vals[instr->src[0].src.ssa->index]) { /* uniform condition and values in sgpr */
+   if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */
        if (dst.regClass() == s1 || dst.regClass() == s2) {
           assert((then.regClass() == s1 || then.regClass() == s2) && els.regClass() == then.regClass());
           assert(dst.size() == then.size());
           aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
           bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
        } else {
-         fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
        }
        return;
     }
@@ -882,7 +1076,8 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
     bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
  
     /* Extract the exponent and compute the unbiased value. */
-   Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f64, bld.def(v1), val);
+   Temp exponent = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand(20u), Operand(11u));
+   exponent = bld.vsub32(bld.def(v1), exponent, Operand(1023u));
  
     /* Extract the fractional part. */
     Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand(-1u), Operand(0x000fffffu));
@@ -898,7 +1093,7 @@ Temp emit_trunc_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
     fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp);
  
     /* Get the sign bit. */
-   Temp sign = bld.vop2(aco_opcode::v_ashr_i32, bld.def(v1), Operand(31u), val_hi);
+   Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x80000000u), val_hi);
  
     /* Decide the operation to apply depending on the unbiased exponent. */
     Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, Operand(0u));
@@ -916,7 +1111,8 @@ Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
     if (ctx->options->chip_class >= GFX7)
        return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
  
-   /* GFX6 doesn't support V_FLOOR_F64, lower it. */
+   /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually
+    * lowered at NIR level for precision reasons). */
     Temp src0 = as_vgpr(ctx, val);
  
     Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
@@ -942,67 +1138,14 @@ Temp emit_floor_f64(isel_context *ctx, Builder& bld, Definition dst, Temp val)
     return add->definitions[0].getTemp();
  }
  
-Temp convert_int(Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) {
-   if (!dst.id()) {
-      if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
-         dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
-      else
-         dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
-   }
-
-   if (dst.bytes() == src.bytes() && dst_bits < src_bits)
-      return bld.copy(Definition(dst), src);
-   else if (dst.bytes() < src.bytes())
-      return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
-
-   Temp tmp = dst;
-   if (dst_bits == 64)
-      tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
-
-   if (tmp == src) {
-   } else if (src.regClass() == s1) {
-      if (is_signed)
-         bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src);
-      else
-         bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src);
-   } else {
-      assert(src_bits != 8 || src.regClass() == v1b);
-      assert(src_bits != 16 || src.regClass() == v2b);
-      aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
-      sdwa->operands[0] = Operand(src);
-      sdwa->definitions[0] = Definition(tmp);
-      if (is_signed)
-         sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
-      else
-         sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
-      sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
-      bld.insert(std::move(sdwa));
-   }
-
-   if (dst_bits == 64) {
-      if (is_signed && dst.regClass() == s2) {
-         Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u));
-         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
-      } else if (is_signed && dst.regClass() == v2) {
-         Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp);
-         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
-      } else {
-         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
-      }
-   }
-
-   return dst;
-}
-
  void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
  {
     if (!instr->dest.dest.is_ssa) {
-      fprintf(stderr, "nir alu dst not in ssa: ");
-      nir_print_instr(&instr->instr, stderr);
-      fprintf(stderr, "\n");
+      isel_err(&instr->instr, "nir alu dst not in ssa");
        abort();
     }
     Builder bld(ctx->program, ctx->block);
+   bld.is_precise = instr->exact;
     Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa);
     switch(instr->op) {
     case nir_op_vec2:
@@ -1015,8 +1158,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
  
        if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) {
           aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)};
-         for (unsigned i = 0; i < num; ++i)
-            vec->operands[i] = Operand{elems[i]};
+         RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u);
+         for (unsigned i = 0; i < num; ++i) {
+            if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword())
+               vec->operands[i] = Operand(emit_extract_vector(ctx, elems[i], 0, elem_rc));
+            else
+               vec->operands[i] = Operand{elems[i]};
+         }
           vec->definitions[0] = Definition(dst);
           ctx->block->instructions.emplace_back(std::move(vec));
           ctx->allocated_vec.emplace(dst.id(), elems);
@@ -1055,13 +1203,15 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
              bld.sop1(aco_opcode::s_mov_b64, Definition(dst), src);
           else
              unreachable("wrong src register class for nir_op_imov");
-      } else if (dst.regClass() == v1) {
-         bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
-      } else if (dst.regClass() == v2) {
-         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
        } else {
-         nir_print_instr(&instr->instr, stderr);
-         unreachable("Should have been lowered to scalar.");
+         if (dst.regClass() == v1)
+            bld.vop1(aco_opcode::v_mov_b32, Definition(dst), src);
+         else if (dst.regClass() == v1b ||
+                  dst.regClass() == v2b ||
+                  dst.regClass() == v2)
+            bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
+         else
+            unreachable("wrong src register class for nir_op_imov");
        }
        break;
     }
@@ -1075,13 +1225,17 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           bld.sop2(Builder::s_and, Definition(dst), bld.def(s1, scc), tmp, Operand(exec, bld.lm));
        } else if (dst.regClass() == v1) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst);
+      } else if (dst.regClass() == v2) {
+         Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
+         lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo);
+         hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
        } else if (dst.type() == RegType::sgpr) {
           aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
           bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1108,9 +1262,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
              bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
           }
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1121,9 +1273,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           Temp src = get_alu_src(ctx, instr->src[0]);
           bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1151,9 +1301,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1163,9 +1311,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (dst.regClass() == s1) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1175,9 +1321,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (dst.regClass() == s1) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1187,9 +1331,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (dst.regClass() == s1) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1199,9 +1341,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (dst.regClass() == s1) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1210,14 +1350,14 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           emit_boolean_logic(ctx, instr, Builder::s_or, dst);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true);
+      } else if (dst.regClass() == v2) {
+         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst);
        } else if (dst.regClass() == s1) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true);
        } else if (dst.regClass() == s2) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1226,14 +1366,14 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           emit_boolean_logic(ctx, instr, Builder::s_and, dst);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true);
+      } else if (dst.regClass() == v2) {
+         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst);
        } else if (dst.regClass() == s1) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true);
        } else if (dst.regClass() == s2) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1242,14 +1382,14 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           emit_boolean_logic(ctx, instr, Builder::s_xor, dst);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true);
+      } else if (dst.regClass() == v2) {
+         emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst);
        } else if (dst.regClass() == s1) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true);
        } else if (dst.regClass() == s2) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1267,9 +1407,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (dst.regClass() == s1) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1287,9 +1425,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (dst.regClass() == s2) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1307,9 +1443,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (dst.regClass() == s2) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1322,9 +1456,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (src.regClass() == s2) {
           bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1351,9 +1483,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
           bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1363,9 +1493,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (dst.regClass() == v1) {
           bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1401,9 +1529,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1432,9 +1558,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
              bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
           }
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1468,9 +1592,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1504,9 +1626,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1539,9 +1659,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1552,9 +1670,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (dst.regClass() == s1) {
           emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1568,9 +1684,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
                               as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
           bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1584,9 +1698,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
                               as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
           bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1594,17 +1706,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src0 = get_alu_src(ctx, instr->src[0]);
        Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.tmp(v1);
-         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, tmp, true);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true);
        } else if (dst.regClass() == v2) {
           bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1612,17 +1720,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src0 = get_alu_src(ctx, instr->src[0]);
        Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.tmp(v1);
-         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, tmp, true);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true);
        } else if (dst.regClass() == v2) {
           bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1630,12 +1734,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src0 = get_alu_src(ctx, instr->src[0]);
        Temp src1 = get_alu_src(ctx, instr->src[1]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.tmp(v1);
           if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
-            emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, tmp, false);
+            emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false);
           else
-            emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, tmp, true);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+            emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true);
        } else if (dst.regClass() == v1) {
           if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr)
              emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false);
@@ -1647,9 +1749,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
           sub->neg[1] = true;
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1658,9 +1758,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
        if (dst.regClass() == v2b) {
           // TODO: check fp_mode.must_flush_denorms16_64
-         Temp tmp = bld.tmp(v1);
-         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, tmp, true);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
        } else if (dst.regClass() == v2) {
@@ -1671,9 +1769,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
              bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1);
           }
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1682,9 +1778,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1]));
        if (dst.regClass() == v2b) {
           // TODO: check fp_mode.must_flush_denorms16_64
-         Temp tmp = bld.tmp(v1);
-         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, tmp, true);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true);
        } else if (dst.regClass() == v1) {
           emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, ctx->block->fp_mode.must_flush_denorms32);
        } else if (dst.regClass() == v2) {
@@ -1695,51 +1789,37 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
              bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1);
           }
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_fmax3: {
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.tmp(v1);
-         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, tmp, false);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f16, dst, false);
        } else if (dst.regClass() == v1) {
           emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_fmin3: {
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.tmp(v1);
-         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, tmp, false);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f16, dst, false);
        } else if (dst.regClass() == v1) {
           emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_fmed3: {
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.tmp(v1);
-         emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, tmp, false);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f16, dst, false);
        } else if (dst.regClass() == v1) {
           emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1747,9 +1827,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        if (dst.size() == 1) {
           emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1757,9 +1835,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        if (dst.size() == 1) {
           emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1767,9 +1843,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        if (dst.size() == 1) {
           emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1777,9 +1851,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        if (dst.size() == 1) {
           emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1787,9 +1859,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        if (dst.size() == 1) {
           emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1797,9 +1867,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        if (dst.size() == 1) {
           emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -1812,8 +1880,10 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
        Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
        Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
-      sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
-      tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
+      sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1),
+                    bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma), Operand(0x3f000000u/*0.5*/));
+      tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1),
+                    bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma), Operand(0x3f000000u/*0.5*/));
        bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
        break;
     }
@@ -1832,24 +1902,23 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_frsq: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_rsq_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_rsq(ctx, bld, Definition(dst), src);
        } else if (dst.regClass() == v2) {
+         /* Lowered at NIR level for precision reasons. */
           emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_fneg: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x8000u), as_vgpr(ctx, src));
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         if (ctx->block->fp_mode.must_flush_denorms16_64)
+            src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
+         bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
        } else if (dst.regClass() == v1) {
           if (ctx->block->fp_mode.must_flush_denorms32)
              src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
@@ -1862,17 +1931,16 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_fabs: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFu), as_vgpr(ctx, src));
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         if (ctx->block->fp_mode.must_flush_denorms16_64)
+            src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00), as_vgpr(ctx, src));
+         bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
        } else if (dst.regClass() == v1) {
           if (ctx->block->fp_mode.must_flush_denorms32)
              src = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand(0x3f800000u), as_vgpr(ctx, src));
@@ -1885,17 +1953,14 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_fsat: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop3(aco_opcode::v_med3_f16, bld.def(v1), Operand(0u), Operand(0x3f800000u), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand((uint16_t)0u), Operand((uint16_t)0x3c00), src);
        } else if (dst.regClass() == v1) {
           bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand(0u), Operand(0x3f800000u), src);
           /* apparently, it is not necessary to flush denorms if this instruction is used with these operands */
@@ -1905,109 +1970,88 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
           vop3->clamp = true;
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_flog2: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_log_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_log2(ctx, bld, Definition(dst), src);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_frcp: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_rcp_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_rcp(ctx, bld, Definition(dst), src);
        } else if (dst.regClass() == v2) {
+         /* Lowered at NIR level for precision reasons. */
           emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_fexp2: {
        if (dst.regClass() == v2b) {
-         Temp src = get_alu_src(ctx, instr->src[0]);
-         Temp tmp = bld.vop1(aco_opcode::v_exp_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_fsqrt: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_sqrt_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_sqrt(ctx, bld, Definition(dst), src);
        } else if (dst.regClass() == v2) {
+         /* Lowered at NIR level for precision reasons. */
           emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_ffract: {
        if (dst.regClass() == v2b) {
-         Temp src = get_alu_src(ctx, instr->src[0]);
-         Temp tmp = bld.vop1(aco_opcode::v_fract_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst);
        } else if (dst.regClass() == v2) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_ffloor: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_floor_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst);
        } else if (dst.regClass() == v2) {
           emit_floor_f64(ctx, bld, Definition(dst), src);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_fceil: {
        Temp src0 = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_ceil_f16, bld.def(v1), src0);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst);
        } else if (dst.regClass() == v2) {
@@ -2028,33 +2072,27 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
              bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
           }
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_ftrunc: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_trunc_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst);
        } else if (dst.regClass() == v2) {
           emit_trunc_f64(ctx, bld, Definition(dst), src);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_fround_even: {
        Temp src0 = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_rndne_f16, bld.def(v1), src0);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst);
        } else if (dst.regClass() == v1) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst);
        } else if (dst.regClass() == v2) {
@@ -2085,9 +2123,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
              bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
           }
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -2095,13 +2131,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_fcos: {
        Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0]));
        aco_ptr<Instruction> norm;
-      Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
        if (dst.regClass() == v2b) {
+         Temp half_pi = bld.copy(bld.def(s1), Operand(0x3118u));
           Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src);
           aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16;
-         tmp = bld.vop1(opcode, bld.def(v1), tmp);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         bld.vop1(opcode, Definition(dst), tmp);
        } else if (dst.regClass() == v1) {
+         Temp half_pi = bld.copy(bld.def(s1), Operand(0x3e22f983u));
           Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src);
  
           /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */
@@ -2111,9 +2147,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
           bld.vop1(opcode, Definition(dst), tmp);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -2121,33 +2155,26 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src0 = get_alu_src(ctx, instr->src[0]);
        Temp src1 = get_alu_src(ctx, instr->src[1]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.tmp(v1);
-         emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, tmp, false);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false);
        } else if (dst.regClass() == v1) {
           bld.vop3(aco_opcode::v_ldexp_f32, Definition(dst), as_vgpr(ctx, src0), src1);
        } else if (dst.regClass() == v2) {
           bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), as_vgpr(ctx, src0), src1);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_frexp_sig: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (dst.regClass() == v2b) {
-         Temp tmp = bld.vop1(aco_opcode::v_frexp_mant_f16, bld.def(v1), src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         bld.vop1(aco_opcode::v_frexp_mant_f16, Definition(dst), src);
        } else if (dst.regClass() == v1) {
           bld.vop1(aco_opcode::v_frexp_mant_f32, Definition(dst), src);
        } else if (dst.regClass() == v2) {
           bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -2156,15 +2183,13 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        if (instr->src[0].src.ssa->bit_size == 16) {
           Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src);
           tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand(0u));
-         convert_int(bld, tmp, 8, 32, true, dst);
+         convert_int(ctx, bld, tmp, 8, 32, true, dst);
        } else if (instr->src[0].src.ssa->bit_size == 32) {
           bld.vop1(aco_opcode::v_frexp_exp_i32_f32, Definition(dst), src);
        } else if (instr->src[0].src.ssa->bit_size == 64) {
           bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -2176,8 +2201,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
           src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), one, src, cond);
           cond = bld.vopc(aco_opcode::v_cmp_le_f16, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
-         Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), minus_one, src, cond);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), minus_one, src, cond);
        } else if (dst.regClass() == v1) {
           Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f32, bld.hint_vcc(bld.def(bld.lm)), Operand(0u), src);
           src = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0x3f800000u), src, cond);
@@ -2194,9 +2218,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
  
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -2205,16 +2227,20 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (instr->src[0].src.ssa->bit_size == 64)
           src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
-      src = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
-      bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src);
+      if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
+         /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
+          * keep value numbering and the scheduler simpler.
+          */
+         bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
+      else
+         bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
        break;
     }
     case nir_op_f2f16_rtz: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (instr->src[0].src.ssa->bit_size == 64)
           src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
-      src = bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), src, Operand(0u));
-      bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), src);
+      bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, Operand(0u));
        break;
     }
     case nir_op_f2f32: {
@@ -2223,9 +2249,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (instr->src[0].src.ssa->bit_size == 64) {
           emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -2240,16 +2264,17 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        assert(dst.regClass() == v2b);
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (instr->src[0].src.ssa->bit_size == 8)
-         src = convert_int(bld, src, 8, 16, true);
-      Temp tmp = bld.vop1(aco_opcode::v_cvt_f16_i16, bld.def(v1), src);
-      bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         src = convert_int(ctx, bld, src, 8, 16, true);
+      else if (instr->src[0].src.ssa->bit_size == 64)
+         src = convert_int(ctx, bld, src, 64, 32, false);
+      bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
        break;
     }
     case nir_op_i2f32: {
        assert(dst.size() == 1);
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (instr->src[0].src.ssa->bit_size <= 16)
-         src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true);
+         src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
        bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
        break;
     }
@@ -2257,7 +2282,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        if (instr->src[0].src.ssa->bit_size <= 32) {
           Temp src = get_alu_src(ctx, instr->src[0]);
           if (instr->src[0].src.ssa->bit_size <= 16)
-            src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true);
+            src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
           bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src);
        } else if (instr->src[0].src.ssa->bit_size == 64) {
           Temp src = get_alu_src(ctx, instr->src[0]);
@@ -2270,9 +2295,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
  
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -2280,20 +2303,20 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        assert(dst.regClass() == v2b);
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (instr->src[0].src.ssa->bit_size == 8)
-         src = convert_int(bld, src, 8, 16, false);
-      Temp tmp = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v1), src);
-      bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         src = convert_int(ctx, bld, src, 8, 16, false);
+      else if (instr->src[0].src.ssa->bit_size == 64)
+         src = convert_int(ctx, bld, src, 64, 32, false);
+      bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src);
        break;
     }
     case nir_op_u2f32: {
        assert(dst.size() == 1);
        Temp src = get_alu_src(ctx, instr->src[0]);
        if (instr->src[0].src.ssa->bit_size == 8) {
-         //TODO: we should use v_cvt_f32_ubyte1/v_cvt_f32_ubyte2/etc depending on the register assignment
           bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src);
        } else {
           if (instr->src[0].src.ssa->bit_size == 16)
-            src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, true);
+            src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true);
           bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src);
        }
        break;
@@ -2302,7 +2325,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        if (instr->src[0].src.ssa->bit_size <= 32) {
           Temp src = get_alu_src(ctx, instr->src[0]);
           if (instr->src[0].src.ssa->bit_size <= 16)
-            src = convert_int(bld, src, instr->src[0].src.ssa->bit_size, 32, false);
+            src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false);
           bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src);
        } else if (instr->src[0].src.ssa->bit_size == 64) {
           Temp src = get_alu_src(ctx, instr->src[0]);
@@ -2314,42 +2337,28 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
           bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_f2i8:
     case nir_op_f2i16: {
-      Temp src = get_alu_src(ctx, instr->src[0]);
        if (instr->src[0].src.ssa->bit_size == 16)
-         src = bld.vop1(aco_opcode::v_cvt_i16_f16, bld.def(v1), src);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst);
        else if (instr->src[0].src.ssa->bit_size == 32)
-         src = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
        else
-         src = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src);
-
-      if (dst.type() == RegType::vgpr)
-         bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
-      else
-         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
        break;
     }
     case nir_op_f2u8:
     case nir_op_f2u16: {
-      Temp src = get_alu_src(ctx, instr->src[0]);
        if (instr->src[0].src.ssa->bit_size == 16)
-         src = bld.vop1(aco_opcode::v_cvt_u16_f16, bld.def(v1), src);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst);
        else if (instr->src[0].src.ssa->bit_size == 32)
-         src = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src);
-      else
-         src = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src);
-
-      if (dst.type() == RegType::vgpr)
-         bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
        else
-         bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src);
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
        break;
     }
     case nir_op_f2i32: {
@@ -2363,23 +2372,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
                         bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp));
           }
        } else if (instr->src[0].src.ssa->bit_size == 32) {
-         if (dst.type() == RegType::vgpr)
-            bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), src);
-         else
-            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
-                       bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), src));
-
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst);
        } else if (instr->src[0].src.ssa->bit_size == 64) {
-         if (dst.type() == RegType::vgpr)
-            bld.vop1(aco_opcode::v_cvt_i32_f64, Definition(dst), src);
-         else
-            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
-                       bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), src));
-
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -2394,23 +2391,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
                         bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp));
           }
        } else if (instr->src[0].src.ssa->bit_size == 32) {
-         if (dst.type() == RegType::vgpr)
-            bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), src);
-         else
-            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
-                       bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), src));
-
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst);
        } else if (instr->src[0].src.ssa->bit_size == 64) {
-         if (dst.type() == RegType::vgpr)
-            bld.vop1(aco_opcode::v_cvt_u32_f64, Definition(dst), src);
-         else
-            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst),
-                       bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), src));
-
+         emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -2487,9 +2472,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
  
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -2559,9 +2542,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
  
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -2574,8 +2555,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand(0x3c00u), src);
        } else if (dst.regClass() == v2b) {
           Temp one = bld.copy(bld.def(v1), Operand(0x3c00u));
-         Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), one, src);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
+         bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), one, src);
        } else {
           unreachable("Wrong destination register class for nir_op_b2f16.");
        }
@@ -2615,31 +2595,52 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
     case nir_op_i2i16:
     case nir_op_i2i32:
     case nir_op_i2i64: {
-      convert_int(bld, get_alu_src(ctx, instr->src[0]),
-                  instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, true, dst);
+      if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
+         /* no need to do the extract in get_alu_src() */
+         sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size ?
+                                  sgpr_extract_sext : sgpr_extract_undef;
+         extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
+      } else {
+         convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
+                     instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, true, dst);
+      }
        break;
     }
     case nir_op_u2u8:
     case nir_op_u2u16:
     case nir_op_u2u32:
     case nir_op_u2u64: {
-      convert_int(bld, get_alu_src(ctx, instr->src[0]),
-                  instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst);
+      if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
+         /* no need to do the extract in get_alu_src() */
+         sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size ?
+                                  sgpr_extract_zext : sgpr_extract_undef;
+         extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
+      } else {
+         convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
+                     instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst);
+      }
        break;
     }
     case nir_op_b2b32:
-   case nir_op_b2i32: {
+   case nir_op_b2i8:
+   case nir_op_b2i16:
+   case nir_op_b2i32:
+   case nir_op_b2i64: {
        Temp src = get_alu_src(ctx, instr->src[0]);
        assert(src.regClass() == bld.lm);
  
-      if (dst.regClass() == s1) {
+      Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;
+      if (tmp.regClass() == s1) {
           // TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
-         bool_to_scalar_condition(ctx, src, dst);
-      } else if (dst.regClass() == v1) {
-         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
+         bool_to_scalar_condition(ctx, src, tmp);
+      } else if (tmp.type() == RegType::vgpr) {
+         bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand(0u), Operand(1u), src);
        } else {
           unreachable("Invalid register class for b2i32");
        }
+
+      if (tmp != dst)
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
        break;
     }
     case nir_op_b2b1:
@@ -2713,40 +2714,38 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           Temp src0 = bld.tmp(v1);
           Temp src1 = bld.tmp(v1);
           bld.pseudo(aco_opcode::p_split_vector, Definition(src0), Definition(src1), src);
-         if (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz)
+         if (0 && (!ctx->block->fp_mode.care_about_round32 || ctx->block->fp_mode.round32 == fp_round_tz)) {
              bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src0, src1);
-         else
-            bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst),
-                     bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0),
-                     bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1));
+         } else {
+            src0 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src0);
+            src1 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src1);
+            if (ctx->program->chip_class >= GFX10) {
+               /* the high bits of v_cvt_f16_f32 isn't zero'd on GFX10 */
+               bld.vop3(aco_opcode::v_pack_b32_f16, Definition(dst), src0, src1);
+            } else {
+               bld.vop3(aco_opcode::v_cvt_pk_u16_u32, Definition(dst), src0, src1);
+            }
+         }
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_unpack_half_2x16_split_x: {
        if (dst.regClass() == v1) {
-         Builder bld(ctx->program, ctx->block);
           bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_op_unpack_half_2x16_split_y: {
        if (dst.regClass() == v1) {
-         Builder bld(ctx->program, ctx->block);
           /* TODO: use SDWA here */
           bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
                    bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -2787,9 +2786,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (dst.regClass() == v1) {
           bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -2832,9 +2829,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
           bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
  
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -2907,9 +2902,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        } else if (src.regClass() == s2) {
           bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -2925,7 +2918,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32, aco_opcode::v_cmp_eq_f64);
        break;
     }
-   case nir_op_fne: {
+   case nir_op_fneu: {
        emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32, aco_opcode::v_cmp_neq_f64);
        break;
     }
@@ -2996,9 +2989,7 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
        break;
     }
     default:
-      fprintf(stderr, "Unknown NIR ALU instr: ");
-      nir_print_instr(&instr->instr, stderr);
-      fprintf(stderr, "\n");
+      isel_err(&instr->instr, "Unknown NIR ALU instr");
     }
  }
  
@@ -3051,35 +3042,6 @@ uint32_t widen_mask(uint32_t mask, unsigned multiplier)
     return new_mask;
  }
  
-void byte_align_vector(isel_context *ctx, Temp vec, Operand offset, Temp dst)
-{
-   Builder bld(ctx->program, ctx->block);
-   if (offset.isTemp()) {
-      Temp tmp[3] = {vec, vec, vec};
-
-      if (vec.size() == 3) {
-         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
-         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
-      } else if (vec.size() == 2) {
-         tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
-         bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
-      }
-      for (unsigned i = 0; i < dst.size(); i++)
-         tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset);
-
-      vec = tmp[0];
-      if (dst.size() == 2)
-         vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
-
-      offset = Operand(0u);
-   }
-
-   if (vec.bytes() == dst.bytes() && offset.constantValue() == 0)
-      bld.copy(Definition(dst), vec);
-   else
-      trim_subdword_vector(ctx, vec, dst, vec.bytes(), ((1 << dst.bytes()) - 1) << offset.constantValue());
-}
-
  struct LoadEmitInfo {
     Operand offset;
     Temp dst;
@@ -3093,8 +3055,7 @@ struct LoadEmitInfo {
  
     bool glc = false;
     unsigned swizzle_component_size = 0;
-   barrier_interaction barrier = barrier_none;
-   bool can_reorder = true;
+   memory_sync_info sync;
     Temp soffset = Temp(0, s1);
  };
  
@@ -3124,7 +3085,9 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
        int byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1;
  
        if (byte_align) {
-         if ((bytes_needed > 2 || !supports_8bit_16bit_loads) && byte_align_loads) {
+         if ((bytes_needed > 2 ||
+              (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) ||
+              !supports_8bit_16bit_loads) && byte_align_loads) {
              if (info->component_stride) {
                 assert(supports_8bit_16bit_loads && "unimplemented");
                 bytes_needed = 2;
@@ -3186,7 +3149,9 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
  
        /* align offset down if needed */
        Operand aligned_offset = offset;
+      unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
        if (need_to_align_offset) {
+         align = 4;
           Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
           if (offset.isConstant()) {
              aligned_offset = Operand(offset.constantValue() & 0xfffffffcu);
@@ -3206,12 +3171,18 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
        Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() :
                                  bld.copy(bld.def(s1), aligned_offset);
  
-      unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
        Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align,
                            reduced_const_offset, byte_align ? Temp() : info->dst);
  
+      /* the callback wrote directly to dst */
+      if (val == info->dst) {
+         assert(num_vals == 0);
+         emit_split_vector(ctx, info->dst, info->num_components);
+         return;
+      }
+
        /* shift result right if needed */
-      if (byte_align) {
+      if (info->component_size < 4 && byte_align_loads) {
           Operand align((uint32_t)byte_align);
           if (byte_align == -1) {
              if (offset.isConstant())
@@ -3222,15 +3193,12 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
                 align = offset;
           }
  
-         if (align.isTemp() || align.constantValue()) {
-            assert(val.bytes() >= load_size && "unimplemented");
-            Temp new_val = bld.tmp(RegClass::get(val.type(), load_size));
-            if (val.type() == RegType::sgpr)
-               byte_align_scalar(ctx, val, align, new_val);
-            else
-               byte_align_vector(ctx, val, align, new_val);
-            val = new_val;
-         }
+         assert(val.bytes() >= load_size && "unimplemented");
+         if (val.type() == RegType::sgpr)
+            byte_align_scalar(ctx, val, align, info->dst);
+         else
+            byte_align_vector(ctx, val, align, info->dst, component_size);
+         return;
        }
  
        /* add result to list and advance */
@@ -3246,13 +3214,6 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
        vals[num_vals++] = val;
     }
  
-   /* the callback wrote directly to dst */
-   if (vals[0] == info->dst) {
-      assert(num_vals == 1);
-      emit_split_vector(ctx, info->dst, info->num_components);
-      return;
-   }
-
     /* create array of components */
     unsigned components_split = 0;
     std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec;
@@ -3271,7 +3232,7 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
        if (num_tmps > 1) {
           aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
              aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
-         for (unsigned i = 0; i < num_vals; i++)
+         for (unsigned i = 0; i < num_tmps; i++)
              vec->operands[i] = Operand(tmp[i]);
           tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
           vec->definitions[0] = Definition(tmp[0]);
@@ -3336,271 +3297,572 @@ void emit_load(isel_context *ctx, Builder& bld, const LoadEmitInfo *info)
     }
  }
  
-Operand load_lds_size_m0(isel_context *ctx)
+Operand load_lds_size_m0(Builder& bld)
  {
     /* TODO: m0 does not need to be initialized on GFX9+ */
-   Builder bld(ctx->program, ctx->block);
     return bld.m0((Temp)bld.sopk(aco_opcode::s_movk_i32, bld.def(s1, m0), 0xffff));
  }
  
+Temp lds_load_callback(Builder& bld, const LoadEmitInfo *info,
+                       Temp offset, unsigned bytes_needed,
+                       unsigned align, unsigned const_offset,
+                       Temp dst_hint)
+{
+   offset = offset.regClass() == s1 ? bld.copy(bld.def(v1), offset) : offset;
+
+   Operand m = load_lds_size_m0(bld);
+
+   bool large_ds_read = bld.program->chip_class >= GFX7;
+   bool usable_read2 = bld.program->chip_class >= GFX7;
+
+   bool read2 = false;
+   unsigned size = 0;
+   aco_opcode op;
+   //TODO: use ds_read_u8_d16_hi/ds_read_u16_d16_hi if beneficial
+   if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) {
+      size = 16;
+      op = aco_opcode::ds_read_b128;
+   } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) {
+      size = 16;
+      read2 = true;
+      op = aco_opcode::ds_read2_b64;
+   } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) {
+      size = 12;
+      op = aco_opcode::ds_read_b96;
+   } else if (bytes_needed >= 8 && align % 8 == 0) {
+      size = 8;
+      op = aco_opcode::ds_read_b64;
+   } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0) {
+      size = 8;
+      read2 = true;
+      op = aco_opcode::ds_read2_b32;
+   } else if (bytes_needed >= 4 && align % 4 == 0) {
+      size = 4;
+      op = aco_opcode::ds_read_b32;
+   } else if (bytes_needed >= 2 && align % 2 == 0) {
+      size = 2;
+      op = aco_opcode::ds_read_u16;
+   } else {
+      size = 1;
+      op = aco_opcode::ds_read_u8;
+   }
+
+   unsigned max_offset_plus_one = read2 ? 254 * (size / 2u) + 1 : 65536;
+   if (const_offset >= max_offset_plus_one) {
+      offset = bld.vadd32(bld.def(v1), offset, Operand(const_offset / max_offset_plus_one));
+      const_offset %= max_offset_plus_one;
+   }
+
+   if (read2)
+      const_offset /= (size / 2u);
+
+   RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4));
+   Temp val = rc == info->dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
+   Instruction *instr;
+   if (read2)
+      instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
+   else
+      instr = bld.ds(op, Definition(val), offset, m, const_offset);
+   static_cast<DS_instruction *>(instr)->sync = info->sync;
+
+   if (size < 4)
+      val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u));
+
+   return val;
+}
+
+static auto emit_lds_load = emit_load<lds_load_callback, false, true, UINT32_MAX>;
+
+Temp smem_load_callback(Builder& bld, const LoadEmitInfo *info,
+                        Temp offset, unsigned bytes_needed,
+                        unsigned align, unsigned const_offset,
+                        Temp dst_hint)
+{
+   unsigned size = 0;
+   aco_opcode op;
+   if (bytes_needed <= 4) {
+      size = 1;
+      op = info->resource.id() ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword;
+   } else if (bytes_needed <= 8) {
+      size = 2;
+      op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2;
+   } else if (bytes_needed <= 16) {
+      size = 4;
+      op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4;
+   } else if (bytes_needed <= 32) {
+      size = 8;
+      op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8;
+   } else {
+      size = 16;
+      op = info->resource.id() ? aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16;
+   }
+   aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
+   if (info->resource.id()) {
+      load->operands[0] = Operand(info->resource);
+      load->operands[1] = Operand(offset);
+   } else {
+      load->operands[0] = Operand(offset);
+      load->operands[1] = Operand(0u);
+   }
+   RegClass rc(RegType::sgpr, size);
+   Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc);
+   load->definitions[0] = Definition(val);
+   load->glc = info->glc;
+   load->dlc = info->glc && bld.program->chip_class >= GFX10;
+   load->sync = info->sync;
+   bld.insert(std::move(load));
+   return val;
+}
+
+static auto emit_smem_load = emit_load<smem_load_callback, true, false, 1024>;
+
+Temp mubuf_load_callback(Builder& bld, const LoadEmitInfo *info,
+                         Temp offset, unsigned bytes_needed,
+                         unsigned align_, unsigned const_offset,
+                         Temp dst_hint)
+{
+   Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
+   Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
+
+   if (info->soffset.id()) {
+      if (soffset.isTemp())
+         vaddr = bld.copy(bld.def(v1), soffset);
+      soffset = Operand(info->soffset);
+   }
+
+   unsigned bytes_size = 0;
+   aco_opcode op;
+   if (bytes_needed == 1 || align_ % 2) {
+      bytes_size = 1;
+      op = aco_opcode::buffer_load_ubyte;
+   } else if (bytes_needed == 2 || align_ % 4) {
+      bytes_size = 2;
+      op = aco_opcode::buffer_load_ushort;
+   } else if (bytes_needed <= 4) {
+      bytes_size = 4;
+      op = aco_opcode::buffer_load_dword;
+   } else if (bytes_needed <= 8) {
+      bytes_size = 8;
+      op = aco_opcode::buffer_load_dwordx2;
+   } else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) {
+      bytes_size = 12;
+      op = aco_opcode::buffer_load_dwordx3;
+   } else {
+      bytes_size = 16;
+      op = aco_opcode::buffer_load_dwordx4;
+   }
+   aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
+   mubuf->operands[0] = Operand(info->resource);
+   mubuf->operands[1] = vaddr;
+   mubuf->operands[2] = soffset;
+   mubuf->offen = (offset.type() == RegType::vgpr);
+   mubuf->glc = info->glc;
+   mubuf->dlc = info->glc && bld.program->chip_class >= GFX10;
+   mubuf->sync = info->sync;
+   mubuf->offset = const_offset;
+   mubuf->swizzled = info->swizzle_component_size != 0;
+   RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
+   Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
+   mubuf->definitions[0] = Definition(val);
+   bld.insert(std::move(mubuf));
+
+   return val;
+}
+
+static auto emit_mubuf_load = emit_load<mubuf_load_callback, true, true, 4096>;
+static auto emit_scratch_load = emit_load<mubuf_load_callback, false, true, 4096>;
+
+Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
+{
+   uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
+                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+
+   if (addr.type() == RegType::vgpr)
+      return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), Operand(-1u), Operand(rsrc_conf));
+   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), Operand(rsrc_conf));
+}
+
+Temp global_load_callback(Builder& bld, const LoadEmitInfo *info,
+                          Temp offset, unsigned bytes_needed,
+                          unsigned align_, unsigned const_offset,
+                          Temp dst_hint)
+{
+   unsigned bytes_size = 0;
+   bool mubuf = bld.program->chip_class == GFX6;
+   bool global = bld.program->chip_class >= GFX9;
+   aco_opcode op;
+   if (bytes_needed == 1) {
+      bytes_size = 1;
+      op = mubuf ? aco_opcode::buffer_load_ubyte : global ? aco_opcode::global_load_ubyte : aco_opcode::flat_load_ubyte;
+   } else if (bytes_needed == 2) {
+      bytes_size = 2;
+      op = mubuf ? aco_opcode::buffer_load_ushort : global ? aco_opcode::global_load_ushort : aco_opcode::flat_load_ushort;
+   } else if (bytes_needed <= 4) {
+      bytes_size = 4;
+      op = mubuf ? aco_opcode::buffer_load_dword : global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
+   } else if (bytes_needed <= 8) {
+      bytes_size = 8;
+      op = mubuf ? aco_opcode::buffer_load_dwordx2 : global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
+   } else if (bytes_needed <= 12 && !mubuf) {
+      bytes_size = 12;
+      op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
+   } else {
+      bytes_size = 16;
+      op = mubuf ? aco_opcode::buffer_load_dwordx4 : global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
+   }
+   RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
+   Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
+   if (mubuf) {
+      aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
+      mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset));
+      mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
+      mubuf->operands[2] = Operand(0u);
+      mubuf->glc = info->glc;
+      mubuf->dlc = false;
+      mubuf->offset = 0;
+      mubuf->addr64 = offset.type() == RegType::vgpr;
+      mubuf->disable_wqm = false;
+      mubuf->sync = info->sync;
+      mubuf->definitions[0] = Definition(val);
+      bld.insert(std::move(mubuf));
+   } else {
+      offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset;
+
+      aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
+      flat->operands[0] = Operand(offset);
+      flat->operands[1] = Operand(s1);
+      flat->glc = info->glc;
+      flat->dlc = info->glc && bld.program->chip_class >= GFX10;
+      flat->sync = info->sync;
+      flat->offset = 0u;
+      flat->definitions[0] = Definition(val);
+      bld.insert(std::move(flat));
+   }
+
+   return val;
+}
+
+static auto emit_global_load = emit_load<global_load_callback, true, true, 1>;
+
  Temp load_lds(isel_context *ctx, unsigned elem_size_bytes, Temp dst,
                Temp address, unsigned base_offset, unsigned align)
  {
-   assert(util_is_power_of_two_nonzero(align) && align >= 4);
+   assert(util_is_power_of_two_nonzero(align));
  
     Builder bld(ctx->program, ctx->block);
  
-   Operand m = load_lds_size_m0(ctx);
+   unsigned num_components = dst.bytes() / elem_size_bytes;
+   LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
+   info.align_mul = align;
+   info.align_offset = 0;
+   info.sync = memory_sync_info(storage_shared);
+   info.const_offset = base_offset;
+   emit_lds_load(ctx, bld, &info);
  
-   unsigned num_components = dst.size() * 4u / elem_size_bytes;
-   unsigned bytes_read = 0;
-   unsigned result_size = 0;
-   unsigned total_bytes = num_components * elem_size_bytes;
-   std::array<Temp, NIR_MAX_VEC_COMPONENTS> result;
-   bool large_ds_read = ctx->options->chip_class >= GFX7;
-   bool usable_read2 = ctx->options->chip_class >= GFX7;
-
-   while (bytes_read < total_bytes) {
-      unsigned todo = total_bytes - bytes_read;
-      bool aligned8 = bytes_read % 8 == 0 && align % 8 == 0;
-      bool aligned16 = bytes_read % 16 == 0 && align % 16 == 0;
-
-      aco_opcode op = aco_opcode::last_opcode;
-      bool read2 = false;
-      if (todo >= 16 && aligned16 && large_ds_read) {
-         op = aco_opcode::ds_read_b128;
-         todo = 16;
-      } else if (todo >= 16 && aligned8 && usable_read2) {
-         op = aco_opcode::ds_read2_b64;
-         read2 = true;
-         todo = 16;
-      } else if (todo >= 12 && aligned16 && large_ds_read) {
-         op = aco_opcode::ds_read_b96;
-         todo = 12;
-      } else if (todo >= 8 && aligned8) {
-         op = aco_opcode::ds_read_b64;
-         todo = 8;
-      } else if (todo >= 8 && usable_read2) {
-         op = aco_opcode::ds_read2_b32;
-         read2 = true;
-         todo = 8;
-      } else if (todo >= 4) {
-         op = aco_opcode::ds_read_b32;
-         todo = 4;
-      } else {
-         assert(false);
-      }
-      assert(todo % elem_size_bytes == 0);
-      unsigned num_elements = todo / elem_size_bytes;
-      unsigned offset = base_offset + bytes_read;
-      unsigned max_offset = read2 ? 1019 : 65535;
+   return dst;
+}
  
-      Temp address_offset = address;
-      if (offset > max_offset) {
-         address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
-         offset = bytes_read;
-      }
-      assert(offset <= max_offset); /* bytes_read shouldn't be large enough for this to happen */
+void split_store_data(isel_context *ctx, RegType dst_type, unsigned count, Temp *dst, unsigned *offsets, Temp src)
+{
+   if (!count)
+      return;
  
-      Temp res;
-      if (num_components == 1 && dst.type() == RegType::vgpr)
-         res = dst;
-      else
-         res = bld.tmp(RegClass(RegType::vgpr, todo / 4));
+   Builder bld(ctx->program, ctx->block);
  
-      if (read2)
-         res = bld.ds(op, Definition(res), address_offset, m, offset / (todo / 2), (offset / (todo / 2)) + 1);
+   ASSERTED bool is_subdword = false;
+   for (unsigned i = 0; i < count; i++)
+      is_subdword |= offsets[i] % 4;
+   is_subdword |= (src.bytes() - offsets[count - 1]) % 4;
+   assert(!is_subdword || dst_type == RegType::vgpr);
+
+   /* count == 1 fast path */
+   if (count == 1) {
+      if (dst_type == RegType::sgpr)
+         dst[0] = bld.as_uniform(src);
        else
-         res = bld.ds(op, Definition(res), address_offset, m, offset);
+         dst[0] = as_vgpr(ctx, src);
+      return;
+   }
  
-      if (num_components == 1) {
-         assert(todo == total_bytes);
-         if (dst.type() == RegType::sgpr)
-            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), res);
-         return dst;
-      }
+   for (unsigned i = 0; i < count - 1; i++)
+      dst[i] = bld.tmp(RegClass::get(dst_type, offsets[i + 1] - offsets[i]));
+   dst[count - 1] = bld.tmp(RegClass::get(dst_type, src.bytes() - offsets[count - 1]));
  
-      if (dst.type() == RegType::sgpr) {
-         Temp new_res = bld.tmp(RegType::sgpr, res.size());
-         expand_vector(ctx, res, new_res, res.size(), (1 << res.size()) - 1);
-         res = new_res;
-      }
+   if (is_subdword && src.type() == RegType::sgpr) {
+      src = as_vgpr(ctx, src);
+   } else {
+      /* use allocated_vec if possible */
+      auto it = ctx->allocated_vec.find(src.id());
+      if (it != ctx->allocated_vec.end()) {
+         if (!it->second[0].id())
+            goto split;
+         unsigned elem_size = it->second[0].bytes();
+         assert(src.bytes() % elem_size == 0);
+
+         for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
+            if (!it->second[i].id())
+               goto split;
+         }
  
-      if (num_elements == 1) {
-         result[result_size++] = res;
-      } else {
-         assert(res != dst && res.size() % num_elements == 0);
-         aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elements)};
-         split->operands[0] = Operand(res);
-         for (unsigned i = 0; i < num_elements; i++)
-            split->definitions[i] = Definition(result[result_size++] = bld.tmp(res.type(), elem_size_bytes / 4));
-         ctx->block->instructions.emplace_back(std::move(split));
-      }
+         for (unsigned i = 0; i < count; i++) {
+            if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
+               goto split;
+         }
+
+         for (unsigned i = 0; i < count; i++) {
+            unsigned start_idx = offsets[i] / elem_size;
+            unsigned op_count = dst[i].bytes() / elem_size;
+            if (op_count == 1) {
+               if (dst_type == RegType::sgpr)
+                  dst[i] = bld.as_uniform(it->second[start_idx]);
+               else
+                  dst[i] = as_vgpr(ctx, it->second[start_idx]);
+               continue;
+            }
  
-      bytes_read += todo;
+            aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, op_count, 1)};
+            for (unsigned j = 0; j < op_count; j++) {
+               Temp tmp = it->second[start_idx + j];
+               if (dst_type == RegType::sgpr)
+                  tmp = bld.as_uniform(tmp);
+               vec->operands[j] = Operand(tmp);
+            }
+            vec->definitions[0] = Definition(dst[i]);
+            bld.insert(std::move(vec));
+         }
+         return;
+      }
     }
  
-   assert(result_size == num_components && result_size > 1);
-   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, result_size, 1)};
-   for (unsigned i = 0; i < result_size; i++)
-      vec->operands[i] = Operand(result[i]);
-   vec->definitions[0] = Definition(dst);
-   ctx->block->instructions.emplace_back(std::move(vec));
-   ctx->allocated_vec.emplace(dst.id(), result);
+   split:
  
-   return dst;
+   if (dst_type == RegType::sgpr)
+      src = bld.as_uniform(src);
+
+   /* just split it */
+   aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
+   split->operands[0] = Operand(src);
+   for (unsigned i = 0; i < count; i++)
+      split->definitions[i] = Definition(dst[i]);
+   bld.insert(std::move(split));
  }
  
-Temp extract_subvector(isel_context *ctx, Temp data, unsigned start, unsigned size, RegType type)
+bool scan_write_mask(uint32_t mask, uint32_t todo_mask,
+                     int *start, int *count)
  {
-   if (start == 0 && size == data.size())
-      return type == RegType::vgpr ? as_vgpr(ctx, data) : data;
-
-   unsigned size_hint = 1;
-   auto it = ctx->allocated_vec.find(data.id());
-   if (it != ctx->allocated_vec.end())
-      size_hint = it->second[0].size();
-   if (size % size_hint || start % size_hint)
-      size_hint = 1;
+   unsigned start_elem = ffs(todo_mask) - 1;
+   bool skip = !(mask & (1 << start_elem));
+   if (skip)
+      mask = ~mask & todo_mask;
  
-   start /= size_hint;
-   size /= size_hint;
+   mask &= todo_mask;
  
-   Temp elems[size];
-   for (unsigned i = 0; i < size; i++)
-      elems[i] = emit_extract_vector(ctx, data, start + i, RegClass(type, size_hint));
+   u_bit_scan_consecutive_range(&mask, start, count);
  
-   if (size == 1)
-      return type == RegType::vgpr ? as_vgpr(ctx, elems[0]) : elems[0];
+   return !skip;
+}
  
-   aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, size, 1)};
-   for (unsigned i = 0; i < size; i++)
-      vec->operands[i] = Operand(elems[i]);
-   Temp res = {ctx->program->allocateId(), RegClass(type, size * size_hint)};
-   vec->definitions[0] = Definition(res);
-   ctx->block->instructions.emplace_back(std::move(vec));
-   return res;
+void advance_write_mask(uint32_t *todo_mask, int start, int count)
+{
+   *todo_mask &= ~u_bit_consecutive(0, count) << start;
  }
  
-void ds_write_helper(isel_context *ctx, Operand m, Temp address, Temp data, unsigned data_start, unsigned total_size, unsigned offset0, unsigned offset1, unsigned align)
+void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
+               Temp address, unsigned base_offset, unsigned align)
  {
+   assert(util_is_power_of_two_nonzero(align));
+   assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8);
+
     Builder bld(ctx->program, ctx->block);
-   unsigned bytes_written = 0;
     bool large_ds_write = ctx->options->chip_class >= GFX7;
     bool usable_write2 = ctx->options->chip_class >= GFX7;
  
-   while (bytes_written < total_size * 4) {
-      unsigned todo = total_size * 4 - bytes_written;
-      bool aligned8 = bytes_written % 8 == 0 && align % 8 == 0;
-      bool aligned16 = bytes_written % 16 == 0 && align % 16 == 0;
+   unsigned write_count = 0;
+   Temp write_datas[32];
+   unsigned offsets[32];
+   aco_opcode opcodes[32];
+
+   wrmask = widen_mask(wrmask, elem_size_bytes);
+
+   uint32_t todo = u_bit_consecutive(0, data.bytes());
+   while (todo) {
+      int offset, bytes;
+      if (!scan_write_mask(wrmask, todo, &offset, &bytes)) {
+         offsets[write_count] = offset;
+         opcodes[write_count] = aco_opcode::num_opcodes;
+         write_count++;
+         advance_write_mask(&todo, offset, bytes);
+         continue;
+      }
+
+      bool aligned2 = offset % 2 == 0 && align % 2 == 0;
+      bool aligned4 = offset % 4 == 0 && align % 4 == 0;
+      bool aligned8 = offset % 8 == 0 && align % 8 == 0;
+      bool aligned16 = offset % 16 == 0 && align % 16 == 0;
  
-      aco_opcode op = aco_opcode::last_opcode;
-      bool write2 = false;
-      unsigned size = 0;
-      if (todo >= 16 && aligned16 && large_ds_write) {
+      //TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial
+      aco_opcode op = aco_opcode::num_opcodes;
+      if (bytes >= 16 && aligned16 && large_ds_write) {
           op = aco_opcode::ds_write_b128;
-         size = 4;
-      } else if (todo >= 16 && aligned8 && usable_write2) {
-         op = aco_opcode::ds_write2_b64;
-         write2 = true;
-         size = 4;
-      } else if (todo >= 12 && aligned16 && large_ds_write) {
+         bytes = 16;
+      } else if (bytes >= 12 && aligned16 && large_ds_write) {
           op = aco_opcode::ds_write_b96;
-         size = 3;
-      } else if (todo >= 8 && aligned8) {
+         bytes = 12;
+      } else if (bytes >= 8 && aligned8) {
           op = aco_opcode::ds_write_b64;
-         size = 2;
-      } else if (todo >= 8 && usable_write2) {
-         op = aco_opcode::ds_write2_b32;
-         write2 = true;
-         size = 2;
-      } else if (todo >= 4) {
+         bytes = 8;
+      } else if (bytes >= 4 && aligned4) {
           op = aco_opcode::ds_write_b32;
-         size = 1;
+         bytes = 4;
+      } else if (bytes >= 2 && aligned2) {
+         op = aco_opcode::ds_write_b16;
+         bytes = 2;
+      } else if (bytes >= 1) {
+         op = aco_opcode::ds_write_b8;
+         bytes = 1;
        } else {
           assert(false);
        }
  
-      unsigned offset = offset0 + offset1 + bytes_written;
-      unsigned max_offset = write2 ? 1020 : 65535;
+      offsets[write_count] = offset;
+      opcodes[write_count] = op;
+      write_count++;
+      advance_write_mask(&todo, offset, bytes);
+   }
+
+   Operand m = load_lds_size_m0(bld);
+
+   split_store_data(ctx, RegType::vgpr, write_count, write_datas, offsets, data);
+
+   for (unsigned i = 0; i < write_count; i++) {
+      aco_opcode op = opcodes[i];
+      if (op == aco_opcode::num_opcodes)
+         continue;
+
+      Temp data = write_datas[i];
+
+      unsigned second = write_count;
+      if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) {
+         for (second = i + 1; second < write_count; second++) {
+            if (opcodes[second] == op && (offsets[second] - offsets[i]) % data.bytes() == 0) {
+               op = data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
+               opcodes[second] = aco_opcode::num_opcodes;
+               break;
+            }
+         }
+      }
+
+      bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64;
+      unsigned write2_off = (offsets[second] - offsets[i]) / data.bytes();
+
+      unsigned inline_offset = base_offset + offsets[i];
+      unsigned max_offset = write2 ? (255 - write2_off) * data.bytes() : 65535;
        Temp address_offset = address;
-      if (offset > max_offset) {
-         address_offset = bld.vadd32(bld.def(v1), Operand(offset0), address_offset);
-         offset = offset1 + bytes_written;
+      if (inline_offset > max_offset) {
+         address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
+         inline_offset = offsets[i];
        }
-      assert(offset <= max_offset); /* offset1 shouldn't be large enough for this to happen */
+      assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */
  
+      Instruction *instr;
        if (write2) {
-         Temp val0 = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size / 2, RegType::vgpr);
-         Temp val1 = extract_subvector(ctx, data, data_start + (bytes_written >> 2) + 1, size / 2, RegType::vgpr);
-         bld.ds(op, address_offset, val0, val1, m, offset / size / 2, (offset / size / 2) + 1);
+         Temp second_data = write_datas[second];
+         inline_offset /= data.bytes();
+         instr = bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
        } else {
-         Temp val = extract_subvector(ctx, data, data_start + (bytes_written >> 2), size, RegType::vgpr);
-         bld.ds(op, address_offset, val, m, offset);
+         instr = bld.ds(op, address_offset, data, m, inline_offset);
        }
-
-      bytes_written += size * 4;
+      static_cast<DS_instruction *>(instr)->sync =
+         memory_sync_info(storage_shared);
     }
  }
  
-void store_lds(isel_context *ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask,
-               Temp address, unsigned base_offset, unsigned align)
+unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
  {
-   assert(util_is_power_of_two_nonzero(align) && align >= 4);
-   assert(elem_size_bytes == 4 || elem_size_bytes == 8);
-
-   Operand m = load_lds_size_m0(ctx);
+   unsigned align = 16;
+   if (const_offset)
+      align = std::min(align, 1u << (ffs(const_offset) - 1));
  
-   /* we need at most two stores, assuming that the writemask is at most 4 bits wide */
-   assert(wrmask <= 0x0f);
-   int start[2], count[2];
-   u_bit_scan_consecutive_range(&wrmask, &start[0], &count[0]);
-   u_bit_scan_consecutive_range(&wrmask, &start[1], &count[1]);
-   assert(wrmask == 0);
+   return align;
+}
  
-   /* one combined store is sufficient */
-   if (count[0] == count[1] && (align % elem_size_bytes) == 0 && (base_offset % elem_size_bytes) == 0) {
-      Builder bld(ctx->program, ctx->block);
  
-      Temp address_offset = address;
-      if ((base_offset / elem_size_bytes) + start[1] > 255) {
-         address_offset = bld.vadd32(bld.def(v1), Operand(base_offset), address_offset);
-         base_offset = 0;
+aco_opcode get_buffer_store_op(bool smem, unsigned bytes)
+{
+   switch (bytes) {
+   case 1:
+      assert(!smem);
+      return aco_opcode::buffer_store_byte;
+   case 2:
+      assert(!smem);
+      return aco_opcode::buffer_store_short;
+   case 4:
+      return smem ? aco_opcode::s_buffer_store_dword : aco_opcode::buffer_store_dword;
+   case 8:
+      return smem ? aco_opcode::s_buffer_store_dwordx2 : aco_opcode::buffer_store_dwordx2;
+   case 12:
+      assert(!smem);
+      return aco_opcode::buffer_store_dwordx3;
+   case 16:
+      return smem ? aco_opcode::s_buffer_store_dwordx4 : aco_opcode::buffer_store_dwordx4;
+   }
+   unreachable("Unexpected store size");
+   return aco_opcode::num_opcodes;
+}
+
+void split_buffer_store(isel_context *ctx, nir_intrinsic_instr *instr, bool smem, RegType dst_type,
+                        Temp data, unsigned writemask, int swizzle_element_size,
+                        unsigned *write_count, Temp *write_datas, unsigned *offsets)
+{
+   unsigned write_count_with_skips = 0;
+   bool skips[16];
+
+   /* determine how to split the data */
+   unsigned todo = u_bit_consecutive(0, data.bytes());
+   while (todo) {
+      int offset, bytes;
+      skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &bytes);
+      offsets[write_count_with_skips] = offset;
+      if (skips[write_count_with_skips]) {
+         advance_write_mask(&todo, offset, bytes);
+         write_count_with_skips++;
+         continue;
        }
  
-      assert(count[0] == 1);
-      RegClass xtract_rc(RegType::vgpr, elem_size_bytes / 4);
+      /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be
+       * larger than swizzle_element_size */
+      bytes = MIN2(bytes, swizzle_element_size);
+      if (bytes % 4)
+         bytes = bytes > 4 ? bytes & ~0x3 : MIN2(bytes, 2);
  
-      Temp val0 = emit_extract_vector(ctx, data, start[0], xtract_rc);
-      Temp val1 = emit_extract_vector(ctx, data, start[1], xtract_rc);
-      aco_opcode op = elem_size_bytes == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64;
-      base_offset = base_offset / elem_size_bytes;
-      bld.ds(op, address_offset, val0, val1, m,
-             base_offset + start[0], base_offset + start[1]);
-      return;
-   }
+      /* SMEM and GFX6 VMEM can't emit 12-byte stores */
+      if ((ctx->program->chip_class == GFX6 || smem) && bytes == 12)
+         bytes = 8;
  
-   for (unsigned i = 0; i < 2; i++) {
-      if (count[i] == 0)
-         continue;
+      /* dword or larger stores have to be dword-aligned */
+      unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4;
+      unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset;
+      bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0;
+      if (!dword_aligned)
+         bytes = MIN2(bytes, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 2 : 1);
  
-      unsigned elem_size_words = elem_size_bytes / 4;
-      ds_write_helper(ctx, m, address, data, start[i] * elem_size_words, count[i] * elem_size_words,
-                      base_offset, start[i] * elem_size_bytes, align);
+      advance_write_mask(&todo, offset, bytes);
+      write_count_with_skips++;
     }
-   return;
-}
  
-unsigned calculate_lds_alignment(isel_context *ctx, unsigned const_offset)
-{
-   unsigned align = 16;
-   if (const_offset)
-      align = std::min(align, 1u << (ffs(const_offset) - 1));
+   /* actually split data */
+   split_store_data(ctx, dst_type, write_count_with_skips, write_datas, offsets, data);
  
-   return align;
+   /* remove skips */
+   for (unsigned i = 0; i < write_count_with_skips; i++) {
+      if (skips[i])
+         continue;
+      write_datas[*write_count] = write_datas[i];
+      offsets[*write_count] = offsets[i];
+      (*write_count)++;
+   }
  }
  
-
  Temp create_vec_from_array(isel_context *ctx, Temp arr[], unsigned cnt, RegType reg_type, unsigned elem_size_bytes,
                             unsigned split_cnt = 0u, Temp dst = Temp())
  {
@@ -3656,125 +3918,67 @@ inline unsigned resolve_excess_vmem_const_offset(Builder &bld, Temp &voffset, un
  }
  
  void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
-                             unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false)
+                             unsigned const_offset = 0u, memory_sync_info sync=memory_sync_info(),
+                             bool slc = false, bool swizzled = false)
  {
     assert(vdata.id());
     assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
     assert(vdata.size() >= 1 && vdata.size() <= 4);
  
     Builder bld(ctx->program, ctx->block);
-   aco_opcode op = (aco_opcode) ((unsigned) aco_opcode::buffer_store_dword + vdata.size() - 1);
+   aco_opcode op = get_buffer_store_op(false, vdata.bytes());
     const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
  
     Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
     Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
     Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
-                                 /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false,
-                                 /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc);
+                                 /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
+                                 /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
+                                 /* dlc*/ false, /* slc */ slc);
  
-   static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
+   static_cast<MUBUF_instruction *>(r.instr)->sync = sync;
  }
  
  void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
                                     unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
-                                   bool allow_combining = true, bool reorder = true, bool slc = false)
+                                   bool allow_combining = true, memory_sync_info sync=memory_sync_info(), bool slc = false)
  {
     Builder bld(ctx->program, ctx->block);
-   assert(elem_size_bytes == 4 || elem_size_bytes == 8);
+   assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
     assert(write_mask);
+   write_mask = widen_mask(write_mask, elem_size_bytes);
  
-   if (elem_size_bytes == 8) {
-      elem_size_bytes = 4;
-      write_mask = widen_mask(write_mask, 2);
-   }
-
-   while (write_mask) {
-      int start = 0;
-      int count = 0;
-      u_bit_scan_consecutive_range(&write_mask, &start, &count);
-      assert(count > 0);
-      assert(start >= 0);
+   unsigned write_count = 0;
+   Temp write_datas[32];
+   unsigned offsets[32];
+   split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask,
+                      allow_combining ? 16 : 4, &write_count, write_datas, offsets);
  
-      while (count > 0) {
-         unsigned sub_count = allow_combining ? MIN2(count, 4) : 1;
-         unsigned const_offset = (unsigned) start * elem_size_bytes + base_const_offset;
-
-         /* GFX6 doesn't have buffer_store_dwordx3, so make sure not to emit that here either. */
-         if (unlikely(ctx->program->chip_class == GFX6 && sub_count == 3))
-            sub_count = 2;
-
-         Temp elem = extract_subvector(ctx, src, start, sub_count, RegType::vgpr);
-         emit_single_mubuf_store(ctx, descriptor, voffset, soffset, elem, const_offset, reorder, slc);
-
-         count -= sub_count;
-         start += sub_count;
-      }
-
-      assert(count == 0);
+   for (unsigned i = 0; i < write_count; i++) {
+      unsigned const_offset = offsets[i] + base_const_offset;
+      emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync, slc, !allow_combining);
     }
  }
  
-Temp emit_single_mubuf_load(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset,
-                            unsigned const_offset, unsigned size_dwords, bool allow_reorder = true)
-{
-   assert(size_dwords != 3 || ctx->program->chip_class != GFX6);
-   assert(size_dwords >= 1 && size_dwords <= 4);
-
-   Builder bld(ctx->program, ctx->block);
-   Temp vdata = bld.tmp(RegClass(RegType::vgpr, size_dwords));
-   aco_opcode op = (aco_opcode) ((unsigned) aco_opcode::buffer_load_dword + size_dwords - 1);
-   const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset);
-
-   Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
-   Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
-   Builder::Result r = bld.mubuf(op, Definition(vdata), Operand(descriptor), voffset_op, soffset_op, const_offset,
-                                 /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false,
-                                 /* disable_wqm */ false, /* glc */ true,
-                                 /* dlc*/ ctx->program->chip_class >= GFX10, /* slc */ false);
-
-   static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
-
-   return vdata;
-}
-
  void load_vmem_mubuf(isel_context *ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset,
                       unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components,
                       unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true)
  {
-   assert(elem_size_bytes == 4 || elem_size_bytes == 8);
-   assert((num_components * elem_size_bytes / 4) == dst.size());
+   assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
+   assert((num_components * elem_size_bytes) == dst.bytes());
     assert(!!stride != allow_combining);
  
     Builder bld(ctx->program, ctx->block);
-   unsigned split_cnt = num_components;
  
-   if (elem_size_bytes == 8) {
-      elem_size_bytes = 4;
-      num_components *= 2;
-   }
-
-   if (!stride)
-      stride = elem_size_bytes;
-
-   unsigned load_size = 1;
-   if (allow_combining) {
-      if ((num_components % 4) == 0)
-         load_size = 4;
-      else if ((num_components % 3) == 0 && ctx->program->chip_class != GFX6)
-         load_size = 3;
-      else if ((num_components % 2) == 0)
-         load_size = 2;
-   }
-
-   unsigned num_loads = num_components / load_size;
-   std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
-
-   for (unsigned i = 0; i < num_loads; ++i) {
-      unsigned const_offset = i * stride * load_size + base_const_offset;
-      elems[i] = emit_single_mubuf_load(ctx, descriptor, voffset, soffset, const_offset, load_size, allow_reorder);
-   }
-
-   create_vec_from_array(ctx, elems.data(), num_loads, RegType::vgpr, load_size * 4u, split_cnt, dst);
+   LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor};
+   info.component_stride = allow_combining ? 0 : stride;
+   info.glc = true;
+   info.swizzle_component_size = allow_combining ? 0 : 4;
+   info.align_mul = MIN2(elem_size_bytes, 4);
+   info.align_offset = 0;
+   info.soffset = soffset;
+   info.const_offset = base_const_offset;
+   emit_mubuf_load(ctx, bld, &info);
  }
  
  std::pair<Temp, unsigned> offset_add_from_nir(isel_context *ctx, const std::pair<Temp, unsigned> &base_offset, nir_src *off_src, unsigned stride = 1u)
@@ -3906,11 +4110,9 @@ std::pair<Temp, unsigned> get_tcs_output_lds_offset(isel_context *ctx, nir_intri
     Builder bld(ctx->program, ctx->block);
  
     uint32_t input_patch_size = ctx->args->options->key.tcs.input_vertices * ctx->tcs_num_inputs * 16;
-   uint32_t num_tcs_outputs = util_last_bit64(ctx->args->shader_info->tcs.outputs_written);
-   uint32_t num_tcs_patch_outputs = util_last_bit64(ctx->args->shader_info->tcs.patch_outputs_written);
-   uint32_t output_vertex_size = num_tcs_outputs * 16;
+   uint32_t output_vertex_size = ctx->tcs_num_outputs * 16;
     uint32_t pervertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
-   uint32_t output_patch_stride = pervertex_output_patch_size + num_tcs_patch_outputs * 16;
+   uint32_t output_patch_stride = pervertex_output_patch_size + ctx->tcs_num_patch_outputs * 16;
  
     std::pair<Temp, unsigned> offs = instr
                                      ? get_intrinsic_io_basic_offset(ctx, instr, 4u)
@@ -3958,11 +4160,7 @@ std::pair<Temp, unsigned> get_tcs_per_patch_output_vmem_offset(isel_context *ctx
  {
     Builder bld(ctx->program, ctx->block);
  
-   unsigned num_tcs_outputs = ctx->shader->info.stage == MESA_SHADER_TESS_CTRL
-                              ? util_last_bit64(ctx->args->shader_info->tcs.outputs_written)
-                              : ctx->args->options->key.tes.tcs_num_outputs;
-
-   unsigned output_vertex_size = num_tcs_outputs * 16;
+   unsigned output_vertex_size = ctx->tcs_num_outputs * 16;
     unsigned per_vertex_output_patch_size = ctx->shader->info.tess.tcs_vertices_out * output_vertex_size;
     unsigned per_patch_data_offset = per_vertex_output_patch_size * ctx->tcs_num_patches;
     unsigned attr_stride = ctx->tcs_num_patches;
@@ -3983,10 +4181,12 @@ std::pair<Temp, unsigned> get_tcs_per_patch_output_vmem_offset(isel_context *ctx
  
  bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr *instr, bool per_vertex, uint64_t mask, bool *indirect)
  {
+   assert(per_vertex || ctx->shader->info.stage == MESA_SHADER_TESS_CTRL);
+
     if (mask == 0)
        return false;
  
-   unsigned off = nir_intrinsic_base(instr) * 4u;
+   unsigned drv_loc = nir_intrinsic_base(instr);
     nir_src *off_src = nir_get_io_offset_src(instr);
  
     if (!nir_src_is_const(*off_src)) {
@@ -3995,15 +4195,10 @@ bool tcs_driver_location_matches_api_mask(isel_context *ctx, nir_intrinsic_instr
     }
  
     *indirect = false;
-   off += nir_src_as_uint(*off_src) * 16u;
-
-   while (mask) {
-      unsigned slot = u_bit_scan64(&mask) + (per_vertex ? 0 : VARYING_SLOT_PATCH0);
-      if (off == shader_io_get_unique_index((gl_varying_slot) slot) * 16u)
-         return true;
-   }
-
-   return false;
+   uint64_t slot = per_vertex
+                   ? ctx->output_drv_loc_to_var_slot[ctx->shader->info.stage][drv_loc / 4]
+                   : (ctx->output_tcs_patch_drv_loc_to_var_slot[drv_loc / 4] - VARYING_SLOT_PATCH0);
+   return (((uint64_t) 1) << slot) & mask;
  }
  
  bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
@@ -4022,10 +4217,12 @@ bool store_output_to_temps(isel_context *ctx, nir_intrinsic_instr *instr)
     if (instr->src[0].ssa->bit_size == 64)
        write_mask = widen_mask(write_mask, 2);
  
+   RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1;
+
     for (unsigned i = 0; i < 8; ++i) {
        if (write_mask & (1 << i)) {
           ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u);
-         ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, v1);
+         ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc);
        }
        idx++;
     }
@@ -4079,7 +4276,7 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
        /* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
        Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
        Temp es2gs_offset = get_arg(ctx, ctx->args->es2gs_offset);
-      store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, true, true);
+      store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, memory_sync_info(), true);
     } else {
        Temp lds_base;
  
@@ -4097,9 +4294,8 @@ void visit_store_ls_or_es_output(isel_context *ctx, nir_intrinsic_instr *instr)
           /* GFX6-8: VS runs on LS stage when tessellation is used, but LS shares LDS space with HS.
            * GFX9+: LS is merged into HS, but still uses the same LDS layout.
            */
-         unsigned num_tcs_inputs = util_last_bit64(ctx->args->shader_info->vs.ls_outputs_written);
           Temp vertex_idx = get_arg(ctx, ctx->args->rel_auto_id);
-         lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, num_tcs_inputs * 16u);
+         lds_base = bld.v_mul24_imm(bld.def(v1), vertex_idx, ctx->tcs_num_inputs * 16u);
        } else {
           unreachable("Invalid LS or ES stage");
        }
@@ -4165,7 +4361,7 @@ void visit_store_tcs_output(isel_context *ctx, nir_intrinsic_instr *instr, bool
  
        Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
        Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
-      store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, false);
+      store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, memory_sync_info(storage_vmem_output));
     }
  
     if (write_to_lds) {
@@ -4200,9 +4396,7 @@ void visit_store_output(isel_context *ctx, nir_intrinsic_instr *instr)
         ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
        bool stored_to_temps = store_output_to_temps(ctx, instr);
        if (!stored_to_temps) {
-         fprintf(stderr, "Unimplemented output offset instruction:\n");
-         nir_print_instr(instr->src[1].ssa->parent_instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
           abort();
        }
     } else if (ctx->stage == vertex_es ||
@@ -4230,10 +4424,40 @@ void emit_interp_instr(isel_context *ctx, unsigned idx, unsigned component, Temp
     Temp coord2 = emit_extract_vector(ctx, src, 1, v1);
  
     Builder bld(ctx->program, ctx->block);
-   Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, bld.m0(prim_mask), idx, component);
-   if (ctx->program->has_16bank_lds)
-      interp_p1.instr->operands[0].setLateKill(true);
-   bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx, component);
+
+   if (dst.regClass() == v2b) {
+      if (ctx->program->has_16bank_lds) {
+         assert(ctx->options->chip_class <= GFX8);
+         Builder::Result interp_p1 =
+            bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1),
+                       Operand(2u) /* P0 */, bld.m0(prim_mask), idx, component);
+         interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b),
+                                coord1, bld.m0(prim_mask), interp_p1, idx, component);
+         bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2,
+                 bld.m0(prim_mask), interp_p1, idx, component);
+      } else {
+         aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16;
+
+         if (ctx->options->chip_class == GFX8)
+            interp_p2_op = aco_opcode::v_interp_p2_legacy_f16;
+
+         Builder::Result interp_p1 =
+            bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1),
+                       coord1, bld.m0(prim_mask), idx, component);
+         bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask),
+                    interp_p1, idx, component);
+      }
+   } else {
+      Builder::Result interp_p1 =
+         bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1,
+                    bld.m0(prim_mask), idx, component);
+
+      if (ctx->program->has_16bank_lds)
+         interp_p1.instr->operands[0].setLateKill(true);
+
+      bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2,
+                 bld.m0(prim_mask), interp_p1, idx, component);
+   }
  }
  
  void emit_load_frag_coord(isel_context *ctx, Temp dst, unsigned num_components)
@@ -4389,9 +4613,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
  
        nir_instr *off_instr = instr->src[0].ssa->parent_instr;
        if (off_instr->type != nir_instr_type_load_const) {
-         fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
-         nir_print_instr(off_instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(off_instr, "Unimplemented nir_intrinsic_load_input offset");
        }
        uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
  
@@ -4399,6 +4621,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
  
        unsigned location = nir_intrinsic_base(instr) / 4 - VERT_ATTRIB_GENERIC0 + offset;
        unsigned component = nir_intrinsic_component(instr);
+      unsigned bitsize = instr->dest.ssa.bit_size;
        unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location];
        uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location];
        uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location];
@@ -4455,7 +4678,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
  
        /* load channels */
        while (channel_start < num_channels) {
-         unsigned fetch_size = num_channels - channel_start;
+         unsigned fetch_component = num_channels - channel_start;
           unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size;
           bool expanded = false;
  
@@ -4467,15 +4690,17 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
                            vtx_info->chan_byte_size == 4;
           unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID;
           if (!use_mubuf) {
-            fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_size);
+            fetch_dfmt = get_fetch_data_format(ctx, vtx_info, fetch_offset, attrib_stride, &fetch_component);
           } else {
-            if (fetch_size == 3 && ctx->options->chip_class == GFX6) {
+            if (fetch_component == 3 && ctx->options->chip_class == GFX6) {
                 /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. */
-               fetch_size = 4;
+               fetch_component = 4;
                 expanded = true;
              }
           }
  
+         unsigned fetch_bytes = fetch_component * bitsize / 8;
+
           Temp fetch_index = index;
           if (attrib_stride != 0 && fetch_offset > attrib_stride) {
              fetch_index = bld.vadd32(bld.def(v1), Operand(fetch_offset / attrib_stride), fetch_index);
@@ -4489,19 +4714,37 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
           }
  
           aco_opcode opcode;
-         switch (fetch_size) {
-         case 1:
-            opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
-            break;
+         switch (fetch_bytes) {
           case 2:
-            opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
+            assert(!use_mubuf && bitsize == 16);
+            opcode = aco_opcode::tbuffer_load_format_d16_x;
+            break;
+         case 4:
+            if (bitsize == 16) {
+               assert(!use_mubuf);
+               opcode = aco_opcode::tbuffer_load_format_d16_xy;
+            } else {
+               opcode = use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x;
+            }
+            break;
+         case 6:
+            assert(!use_mubuf && bitsize == 16);
+            opcode = aco_opcode::tbuffer_load_format_d16_xyz;
+            break;
+         case 8:
+            if (bitsize == 16) {
+               assert(!use_mubuf);
+               opcode = aco_opcode::tbuffer_load_format_d16_xyzw;
+            } else {
+               opcode = use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy;
+            }
              break;
-         case 3:
+         case 12:
              assert(ctx->options->chip_class >= GFX7 ||
                     (!use_mubuf && ctx->options->chip_class == GFX6));
              opcode = use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz;
              break;
-         case 4:
+         case 16:
              opcode = use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw;
              break;
           default:
@@ -4509,37 +4752,36 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
           }
  
           Temp fetch_dst;
-         if (channel_start == 0 && fetch_size == dst.size() && !post_shuffle &&
+         if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle &&
               !expanded && (alpha_adjust == RADV_ALPHA_ADJUST_NONE ||
                             num_channels <= 3)) {
              direct_fetch = true;
              fetch_dst = dst;
           } else {
-            fetch_dst = bld.tmp(RegType::vgpr, fetch_size);
+            fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes));
           }
  
           if (use_mubuf) {
-            Instruction *mubuf = bld.mubuf(opcode,
-                                           Definition(fetch_dst), list, fetch_index, soffset,
-                                           fetch_offset, false, true).instr;
-            static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
+            bld.mubuf(opcode,
+                      Definition(fetch_dst), list, fetch_index, soffset,
+                      fetch_offset, false, false, true).instr;
           } else {
-            Instruction *mtbuf = bld.mtbuf(opcode,
-                                           Definition(fetch_dst), list, fetch_index, soffset,
-                                           fetch_dfmt, nfmt, fetch_offset, false, true).instr;
-            static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
+            bld.mtbuf(opcode,
+                      Definition(fetch_dst), list, fetch_index, soffset,
+                      fetch_dfmt, nfmt, fetch_offset, false, true).instr;
           }
  
           emit_split_vector(ctx, fetch_dst, fetch_dst.size());
  
-         if (fetch_size == 1) {
+         if (fetch_component == 1) {
              channels[channel_start] = fetch_dst;
           } else {
-            for (unsigned i = 0; i < MIN2(fetch_size, num_channels - channel_start); i++)
-               channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i, v1);
+            for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++)
+               channels[channel_start + i] = emit_extract_vector(ctx, fetch_dst, i,
+                                                                 bitsize == 16 ? v2b : v1);
           }
  
-         channel_start += fetch_size;
+         channel_start += fetch_component;
        }
  
        if (!direct_fetch) {
@@ -4583,9 +4825,7 @@ void visit_load_input(isel_context *ctx, nir_intrinsic_instr *instr)
        nir_instr *off_instr = instr->src[offset_idx].ssa->parent_instr;
        if (off_instr->type != nir_instr_type_load_const ||
            nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
-         fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
-         nir_print_instr(off_instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(off_instr, "Unimplemented nir_intrinsic_load_input offset");
        }
  
        Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
@@ -4819,7 +5059,7 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
  {
     Builder bld(ctx->program, ctx->block);
     Temp index = get_ssa_temp(ctx, instr->src[0].ssa);
-   if (!ctx->divergent_vals[instr->dest.ssa.index])
+   if (!nir_dest_is_divergent(instr->dest))
        index = bld.as_uniform(index);
     unsigned desc_set = nir_intrinsic_desc_set(instr);
     unsigned binding = nir_intrinsic_binding(instr);
@@ -4878,236 +5118,24 @@ void visit_load_resource(isel_context *ctx, nir_intrinsic_instr *instr)
  }
  
  void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
-                 Temp dst, Temp rsrc, Temp offset, int byte_align,
-                 bool glc=false, bool readonly=true)
+                 Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
+                 bool glc=false, bool allow_smem=true, memory_sync_info sync=memory_sync_info())
  {
     Builder bld(ctx->program, ctx->block);
-   bool dlc = glc && ctx->options->chip_class >= GFX10;
-   unsigned num_bytes = num_components * component_size;
-
-   aco_opcode op;
-   if (dst.type() == RegType::vgpr || ((ctx->options->chip_class < GFX8 || component_size < 4) && !readonly)) {
-      Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
-      Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
-      unsigned const_offset = 0;
-
-      /* for small bit sizes add buffer for unaligned loads */
-      if (byte_align) {
-         if (num_bytes > 2)
-            num_bytes += byte_align == -1 ? 4 - component_size : byte_align;
-         else
-            byte_align = 0;
-      }
-
-      Temp lower = Temp();
-      if (num_bytes > 16) {
-         assert(num_components == 3 || num_components == 4);
-         op = aco_opcode::buffer_load_dwordx4;
-         lower = bld.tmp(v4);
-         aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
-         mubuf->definitions[0] = Definition(lower);
-         mubuf->operands[0] = Operand(rsrc);
-         mubuf->operands[1] = vaddr;
-         mubuf->operands[2] = soffset;
-         mubuf->offen = (offset.type() == RegType::vgpr);
-         mubuf->glc = glc;
-         mubuf->dlc = dlc;
-         mubuf->barrier = readonly ? barrier_none : barrier_buffer;
-         mubuf->can_reorder = readonly;
-         bld.insert(std::move(mubuf));
-         emit_split_vector(ctx, lower, 2);
-         num_bytes -= 16;
-         const_offset = 16;
-      } else if (num_bytes == 12 && ctx->options->chip_class == GFX6) {
-         /* GFX6 doesn't support loading vec3, expand to vec4. */
-         num_bytes = 16;
-      }
-
-      switch (num_bytes) {
-         case 1:
-            op = aco_opcode::buffer_load_ubyte;
-            break;
-         case 2:
-            op = aco_opcode::buffer_load_ushort;
-            break;
-         case 3:
-         case 4:
-            op = aco_opcode::buffer_load_dword;
-            break;
-         case 5:
-         case 6:
-         case 7:
-         case 8:
-            op = aco_opcode::buffer_load_dwordx2;
-            break;
-         case 10:
-         case 12:
-            assert(ctx->options->chip_class > GFX6);
-            op = aco_opcode::buffer_load_dwordx3;
-            break;
-         case 16:
-            op = aco_opcode::buffer_load_dwordx4;
-            break;
-         default:
-            unreachable("Load SSBO not implemented for this size.");
-      }
-      aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
-      mubuf->operands[0] = Operand(rsrc);
-      mubuf->operands[1] = vaddr;
-      mubuf->operands[2] = soffset;
-      mubuf->offen = (offset.type() == RegType::vgpr);
-      mubuf->glc = glc;
-      mubuf->dlc = dlc;
-      mubuf->barrier = readonly ? barrier_none : barrier_buffer;
-      mubuf->can_reorder = readonly;
-      mubuf->offset = const_offset;
-      aco_ptr<Instruction> instr = std::move(mubuf);
-
-      if (component_size < 4) {
-         Temp vec = num_bytes <= 4 ? bld.tmp(v1) : num_bytes <= 8 ? bld.tmp(v2) : bld.tmp(v3);
-         instr->definitions[0] = Definition(vec);
-         bld.insert(std::move(instr));
-
-         if (byte_align == -1 || (byte_align && dst.type() == RegType::sgpr)) {
-            Operand align = byte_align == -1 ? Operand(offset) : Operand((uint32_t)byte_align);
-            Temp tmp[3] = {vec, vec, vec};
-
-            if (vec.size() == 3) {
-               tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1);
-               bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), Definition(tmp[2]), vec);
-            } else if (vec.size() == 2) {
-               tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1];
-               bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec);
-            }
-            for (unsigned i = 0; i < dst.size(); i++)
-               tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], align);
-
-            vec = tmp[0];
-            if (dst.size() == 2)
-               vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]);
-
-            byte_align = 0;
-         }
-
-         if (dst.type() == RegType::vgpr && num_components == 1) {
-            bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), vec, Operand(byte_align / component_size));
-         } else {
-            trim_subdword_vector(ctx, vec, dst, 4 * vec.size() / component_size, ((1 << num_components) - 1) << byte_align / component_size);
-         }
-
-         return;
-
-      } else if (dst.size() > 4) {
-         assert(lower != Temp());
-         Temp upper = bld.tmp(RegType::vgpr, dst.size() - lower.size());
-         instr->definitions[0] = Definition(upper);
-         bld.insert(std::move(instr));
-         if (dst.size() == 8)
-            emit_split_vector(ctx, upper, 2);
-         instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, dst.size() / 2, 1));
-         instr->operands[0] = Operand(emit_extract_vector(ctx, lower, 0, v2));
-         instr->operands[1] = Operand(emit_extract_vector(ctx, lower, 1, v2));
-         instr->operands[2] = Operand(emit_extract_vector(ctx, upper, 0, v2));
-         if (dst.size() == 8)
-            instr->operands[3] = Operand(emit_extract_vector(ctx, upper, 1, v2));
-      } else if (dst.size() == 3 && ctx->options->chip_class == GFX6) {
-         Temp vec = bld.tmp(v4);
-         instr->definitions[0] = Definition(vec);
-         bld.insert(std::move(instr));
-         emit_split_vector(ctx, vec, 4);
-
-         instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, 3, 1));
-         instr->operands[0] = Operand(emit_extract_vector(ctx, vec, 0, v1));
-         instr->operands[1] = Operand(emit_extract_vector(ctx, vec, 1, v1));
-         instr->operands[2] = Operand(emit_extract_vector(ctx, vec, 2, v1));
-      }
-
-      if (dst.type() == RegType::sgpr) {
-         Temp vec = bld.tmp(RegType::vgpr, dst.size());
-         instr->definitions[0] = Definition(vec);
-         bld.insert(std::move(instr));
-         expand_vector(ctx, vec, dst, num_components, (1 << num_components) - 1);
-      } else {
-         instr->definitions[0] = Definition(dst);
-         bld.insert(std::move(instr));
-         emit_split_vector(ctx, dst, num_components);
-      }
-   } else {
-      /* for small bit sizes add buffer for unaligned loads */
-      if (byte_align)
-         num_bytes += byte_align == -1 ? 4 - component_size : byte_align;
  
-      switch (num_bytes) {
-         case 1:
-         case 2:
-         case 3:
-         case 4:
-            op = aco_opcode::s_buffer_load_dword;
-            break;
-         case 5:
-         case 6:
-         case 7:
-         case 8:
-            op = aco_opcode::s_buffer_load_dwordx2;
-            break;
-         case 10:
-         case 12:
-         case 16:
-            op = aco_opcode::s_buffer_load_dwordx4;
-            break;
-         case 24:
-         case 32:
-            op = aco_opcode::s_buffer_load_dwordx8;
-            break;
-         default:
-            unreachable("Load SSBO not implemented for this size.");
-      }
+   bool use_smem = dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem;
+   if (use_smem)
        offset = bld.as_uniform(offset);
-      aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
-      load->operands[0] = Operand(rsrc);
-      load->operands[1] = Operand(offset);
-      assert(load->operands[1].getTemp().type() == RegType::sgpr);
-      load->definitions[0] = Definition(dst);
-      load->glc = glc;
-      load->dlc = dlc;
-      load->barrier = readonly ? barrier_none : barrier_buffer;
-      load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
-      assert(ctx->options->chip_class >= GFX8 || !glc);
-
-      /* adjust misaligned small bit size loads */
-      if (byte_align) {
-         Temp vec = num_bytes <= 4 ? bld.tmp(s1) : num_bytes <= 8 ? bld.tmp(s2) : bld.tmp(s4);
-         load->definitions[0] = Definition(vec);
-         bld.insert(std::move(load));
-         Operand byte_offset = byte_align > 0 ? Operand(uint32_t(byte_align)) : Operand(offset);
-         byte_align_scalar(ctx, vec, byte_offset, dst);
-
-      /* trim vector */
-      } else if (dst.size() == 3) {
-         Temp vec = bld.tmp(s4);
-         load->definitions[0] = Definition(vec);
-         bld.insert(std::move(load));
-         emit_split_vector(ctx, vec, 4);
-
-         bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
-                    emit_extract_vector(ctx, vec, 0, s1),
-                    emit_extract_vector(ctx, vec, 1, s1),
-                    emit_extract_vector(ctx, vec, 2, s1));
-      } else if (dst.size() == 6) {
-         Temp vec = bld.tmp(s8);
-         load->definitions[0] = Definition(vec);
-         bld.insert(std::move(load));
-         emit_split_vector(ctx, vec, 4);
-
-         bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
-                    emit_extract_vector(ctx, vec, 0, s2),
-                    emit_extract_vector(ctx, vec, 1, s2),
-                    emit_extract_vector(ctx, vec, 2, s2));
-      } else {
-         bld.insert(std::move(load));
-      }
-      emit_split_vector(ctx, dst, num_components);
-   }
+
+   LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
+   info.glc = glc;
+   info.sync = sync;
+   info.align_mul = align_mul;
+   info.align_offset = align_offset;
+   if (use_smem)
+      emit_smem_load(ctx, bld, &info);
+   else
+      emit_mubuf_load(ctx, bld, &info);
  }
  
  void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
@@ -5146,13 +5174,8 @@ void visit_load_ubo(isel_context *ctx, nir_intrinsic_instr *instr)
        rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
     }
     unsigned size = instr->dest.ssa.bit_size / 8;
-   int byte_align = 0;
-   if (size < 4) {
-      unsigned align_mul = nir_intrinsic_align_mul(instr);
-      unsigned align_offset = nir_intrinsic_align_offset(instr);
-      byte_align = align_mul % 4 == 0 ? align_offset : -1;
-   }
-   load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), byte_align);
+   load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
+               nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr));
  }
  
  void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
@@ -5182,7 +5205,7 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
  
     Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
     if (offset != 0) // TODO check if index != 0 as well
-      index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
+      index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
     Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
     Temp vec = dst;
     bool trim = false;
@@ -5224,7 +5247,7 @@ void visit_load_push_constant(isel_context *ctx, nir_intrinsic_instr *instr)
        unreachable("unimplemented or forbidden load_push_constant.");
     }
  
-   bld.smem(op, Definition(vec), ptr, index);
+   static_cast<SMEM_instruction*>(bld.smem(op, Definition(vec), ptr, index).instr)->prevent_overflow = true;
  
     if (!aligned) {
        Operand byte_offset = index_cv ? Operand((offset + index_cv->u32) % 4) : Operand(index);
@@ -5268,7 +5291,7 @@ void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
  
     Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
     if (base && offset.type() == RegType::sgpr)
-      offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
+      offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
     else if (base && offset.type() == RegType::vgpr)
        offset = bld.vadd32(bld.def(v1), Operand(base), offset);
  
@@ -5278,8 +5301,7 @@ void visit_load_constant(isel_context *ctx, nir_intrinsic_instr *instr)
                            Operand(desc_type));
     unsigned size = instr->dest.ssa.bit_size / 8;
     // TODO: get alignment information for subdword constants
-   unsigned byte_align = size < 4 ? -1 : 0;
-   load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, byte_align);
+   load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0);
  }
  
  void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
@@ -5625,12 +5647,15 @@ static Temp adjust_sample_index_using_fmask(isel_context *ctx, bool da, std::vec
     load->unrm = true;
     load->da = da;
     load->dim = dim;
-   load->can_reorder = true; /* fmask images shouldn't be modified */
     ctx->block->instructions.emplace_back(std::move(load));
  
     Operand sample_index4;
-   if (sample_index.isConstant() && sample_index.constantValue() < 16) {
-      sample_index4 = Operand(sample_index.constantValue() << 2);
+   if (sample_index.isConstant()) {
+      if (sample_index.constantValue() < 16) {
+         sample_index4 = Operand(sample_index.constantValue() << 2);
+      } else {
+         sample_index4 = Operand(0u);
+      }
     } else if (sample_index.regClass() == s1) {
        sample_index4 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), sample_index, Operand(2u));
     } else {
@@ -5721,6 +5746,22 @@ static Temp get_image_coords(isel_context *ctx, const nir_intrinsic_instr *instr
  }
  
  
+memory_sync_info get_memory_sync_info(nir_intrinsic_instr *instr, storage_class storage, unsigned semantics)
+{
+   /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
+   if (semantics & semantic_atomicrmw)
+      return memory_sync_info(storage, semantics);
+
+   unsigned access = nir_intrinsic_access(instr);
+
+   if (access & ACCESS_VOLATILE)
+      semantics |= semantic_volatile;
+   if (access & ACCESS_CAN_REORDER)
+      semantics |= semantic_can_reorder | semantic_private;
+
+   return memory_sync_info(storage, semantics);
+}
+
  void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
  {
     Builder bld(ctx->program, ctx->block);
@@ -5730,6 +5771,9 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
     bool is_array = glsl_sampler_type_is_array(type);
     Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
  
+   memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
+   unsigned access = var->data.access | nir_intrinsic_access(instr);
+
     if (dim == GLSL_SAMPLER_DIM_BUF) {
        unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
        unsigned num_channels = util_last_bit(mask);
@@ -5764,9 +5808,9 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
           tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
        load->definitions[0] = Definition(tmp);
        load->idxen = true;
-      load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
+      load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
        load->dlc = load->glc && ctx->options->chip_class >= GFX10;
-      load->barrier = barrier_image;
+      load->sync = sync;
        ctx->block->instructions.emplace_back(std::move(load));
  
        expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
@@ -5792,13 +5836,13 @@ void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
     load->operands[1] = Operand(s4); /* no sampler */
     load->operands[2] = Operand(coords);
     load->definitions[0] = Definition(tmp);
-   load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
+   load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
     load->dlc = load->glc && ctx->options->chip_class >= GFX10;
     load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
     load->dmask = dmask;
     load->unrm = true;
     load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
-   load->barrier = barrier_image;
+   load->sync = sync;
     ctx->block->instructions.emplace_back(std::move(load));
  
     expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
@@ -5813,7 +5857,9 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
     bool is_array = glsl_sampler_type_is_array(type);
     Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
  
-   bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
+   memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
+   unsigned access = var->data.access | nir_intrinsic_access(instr);
+   bool glc = ctx->options->chip_class == GFX6 || access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
  
     if (dim == GLSL_SAMPLER_DIM_BUF) {
        Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
@@ -5844,7 +5890,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
        store->glc = glc;
        store->dlc = false;
        store->disable_wqm = true;
-      store->barrier = barrier_image;
+      store->sync = sync;
        ctx->program->needs_exact = true;
        ctx->block->instructions.emplace_back(std::move(store));
        return;
@@ -5868,7 +5914,7 @@ void visit_image_store(isel_context *ctx, nir_intrinsic_instr *instr)
     store->unrm = true;
     store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
     store->disable_wqm = true;
-   store->barrier = barrier_image;
+   store->sync = sync;
     ctx->program->needs_exact = true;
     ctx->block->instructions.emplace_back(std::move(store));
     return;
@@ -5946,6 +5992,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
     }
  
     Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+   memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
  
     if (dim == GLSL_SAMPLER_DIM_BUF) {
        Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
@@ -5963,7 +6010,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
        mubuf->glc = return_previous;
        mubuf->dlc = false; /* Not needed for atomics */
        mubuf->disable_wqm = true;
-      mubuf->barrier = barrier_image;
+      mubuf->sync = sync;
        ctx->program->needs_exact = true;
        ctx->block->instructions.emplace_back(std::move(mubuf));
        return;
@@ -5984,7 +6031,7 @@ void visit_image_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
     mimg->unrm = true;
     mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
     mimg->disable_wqm = true;
-   mimg->barrier = barrier_image;
+   mimg->sync = sync;
     ctx->program->needs_exact = true;
     ctx->block->instructions.emplace_back(std::move(mimg));
     return;
@@ -6033,6 +6080,7 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
     }
  
     /* LOD */
+   assert(nir_src_as_uint(instr->src[1]) == 0);
     Temp lod = bld.vop1(aco_opcode::v_mov_b32, bld.def(v1), Operand(0u));
  
     /* Resource */
@@ -6048,7 +6096,6 @@ void visit_image_size(isel_context *ctx, nir_intrinsic_instr *instr)
     mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
     mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
     mimg->da = glsl_sampler_type_is_array(type);
-   mimg->can_reorder = true;
     Definition& def = mimg->definitions[0];
     ctx->block->instructions.emplace_back(std::move(mimg));
  
@@ -6091,15 +6138,20 @@ void visit_load_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
     Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
     rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
  
-   bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
+   unsigned access = nir_intrinsic_access(instr);
+   bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
     unsigned size = instr->dest.ssa.bit_size / 8;
-   int byte_align = 0;
-   if (size < 4) {
-      unsigned align_mul = nir_intrinsic_align_mul(instr);
-      unsigned align_offset = nir_intrinsic_align_offset(instr);
-      byte_align = align_mul % 4 == 0 ? align_offset : -1;
-   }
-   load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), byte_align, glc, false);
+
+   uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[0].ssa, access);
+   /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
+    * TODO: this optimization is disabled for now because we still need to ensure correct ordering
+    */
+   bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_store : has_vmem_store));
+   allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
+
+   load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
+               nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
+               get_memory_sync_info(instr, storage_buffer, 0));
  }
  
  void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
@@ -6107,142 +6159,75 @@ void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
     Builder bld(ctx->program, ctx->block);
     Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
     unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
-   unsigned writemask = nir_intrinsic_write_mask(instr);
+   unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
     Temp offset = get_ssa_temp(ctx, instr->src[2].ssa);
  
     Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
     rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
  
-   bool smem = !ctx->divergent_vals[instr->src[2].ssa->index] &&
+   memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
+   bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+   uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
+   /* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
+    * TODO: this optimization is disabled for now because we still need to ensure correct ordering
+    */
+   bool allow_smem = !(flags & (0 && glc ? has_nonglc_vmem_loadstore : has_vmem_loadstore));
+
+   bool smem = !nir_src_is_divergent(instr->src[2]) &&
                 ctx->options->chip_class >= GFX8 &&
-               elem_size_bytes >= 4;
+               ctx->options->chip_class < GFX10_3 &&
+               (elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) &&
+               allow_smem;
     if (smem)
        offset = bld.as_uniform(offset);
     bool smem_nonfs = smem && ctx->stage != fragment_fs;
  
-   while (writemask) {
-      int start, count;
-      u_bit_scan_consecutive_range(&writemask, &start, &count);
-      if (count == 3 && (smem || ctx->options->chip_class == GFX6)) {
-         /* GFX6 doesn't support storing vec3, split it. */
-         writemask |= 1u << (start + 2);
-         count = 2;
-      }
-      int num_bytes = count * elem_size_bytes;
-
-      /* dword or larger stores have to be dword-aligned */
-      if (elem_size_bytes < 4 && num_bytes > 2) {
-         // TODO: improve alignment check of sub-dword stores
-         unsigned count_new = 2 / elem_size_bytes;
-         writemask |= ((1 << (count - count_new)) - 1) << (start + count_new);
-         count = count_new;
-         num_bytes = 2;
-      }
-
-      if (num_bytes > 16) {
-         assert(elem_size_bytes == 8);
-         writemask |= (((count - 2) << 1) - 1) << (start + 2);
-         count = 2;
-         num_bytes = 16;
-      }
-
-      Temp write_data;
-      if (elem_size_bytes < 4) {
-         if (data.type() == RegType::sgpr) {
-            data = as_vgpr(ctx, data);
-            emit_split_vector(ctx, data, 4 * data.size() / elem_size_bytes);
-         }
-         RegClass rc = RegClass(RegType::vgpr, elem_size_bytes).as_subdword();
-         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
-         for (int i = 0; i < count; i++)
-            vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, rc));
-         write_data = bld.tmp(RegClass(RegType::vgpr, num_bytes).as_subdword());
-         vec->definitions[0] = Definition(write_data);
-         bld.insert(std::move(vec));
-      } else if (count != instr->num_components) {
-         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
-         for (int i = 0; i < count; i++) {
-            Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(data.type(), elem_size_bytes / 4));
-            vec->operands[i] = Operand(smem_nonfs ? bld.as_uniform(elem) : elem);
-         }
-         write_data = bld.tmp(!smem ? RegType::vgpr : smem_nonfs ? RegType::sgpr : data.type(), count * elem_size_bytes / 4);
-         vec->definitions[0] = Definition(write_data);
-         ctx->block->instructions.emplace_back(std::move(vec));
-      } else if (!smem && data.type() != RegType::vgpr) {
-         assert(num_bytes % 4 == 0);
-         write_data = bld.copy(bld.def(RegType::vgpr, num_bytes / 4), data);
-      } else if (smem_nonfs && data.type() == RegType::vgpr) {
-         assert(num_bytes % 4 == 0);
-         write_data = bld.as_uniform(data);
-      } else {
-         write_data = data;
-      }
+   unsigned write_count = 0;
+   Temp write_datas[32];
+   unsigned offsets[32];
+   split_buffer_store(ctx, instr, smem, smem_nonfs ? RegType::sgpr : (smem ? data.type() : RegType::vgpr),
+                      data, writemask, 16, &write_count, write_datas, offsets);
  
-      aco_opcode vmem_op, smem_op = aco_opcode::last_opcode;
-      switch (num_bytes) {
-         case 1:
-            vmem_op = aco_opcode::buffer_store_byte;
-            break;
-         case 2:
-            vmem_op = aco_opcode::buffer_store_short;
-            break;
-         case 4:
-            vmem_op = aco_opcode::buffer_store_dword;
-            smem_op = aco_opcode::s_buffer_store_dword;
-            break;
-         case 8:
-            vmem_op = aco_opcode::buffer_store_dwordx2;
-            smem_op = aco_opcode::s_buffer_store_dwordx2;
-            break;
-         case 12:
-            vmem_op = aco_opcode::buffer_store_dwordx3;
-            assert(!smem && ctx->options->chip_class > GFX6);
-            break;
-         case 16:
-            vmem_op = aco_opcode::buffer_store_dwordx4;
-            smem_op = aco_opcode::s_buffer_store_dwordx4;
-            break;
-         default:
-            unreachable("Store SSBO not implemented for this size.");
-      }
-      if (ctx->stage == fragment_fs)
-         smem_op = aco_opcode::p_fs_buffer_store_smem;
+   for (unsigned i = 0; i < write_count; i++) {
+      aco_opcode op = get_buffer_store_op(smem, write_datas[i].bytes());
+      if (smem && ctx->stage == fragment_fs)
+         op = aco_opcode::p_fs_buffer_store_smem;
  
        if (smem) {
-         aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(smem_op, Format::SMEM, 3, 0)};
+         aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
           store->operands[0] = Operand(rsrc);
-         if (start) {
-            Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
-                                offset, Operand(start * elem_size_bytes));
+         if (offsets[i]) {
+            Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
+                                      offset, Operand(offsets[i]));
              store->operands[1] = Operand(off);
           } else {
              store->operands[1] = Operand(offset);
           }
-         if (smem_op != aco_opcode::p_fs_buffer_store_smem)
+         if (op != aco_opcode::p_fs_buffer_store_smem)
              store->operands[1].setFixed(m0);
-         store->operands[2] = Operand(write_data);
-         store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+         store->operands[2] = Operand(write_datas[i]);
+         store->glc = glc;
           store->dlc = false;
           store->disable_wqm = true;
-         store->barrier = barrier_buffer;
+         store->sync = sync;
           ctx->block->instructions.emplace_back(std::move(store));
           ctx->program->wb_smem_l1_on_end = true;
-         if (smem_op == aco_opcode::p_fs_buffer_store_smem) {
+         if (op == aco_opcode::p_fs_buffer_store_smem) {
              ctx->block->kind |= block_kind_needs_lowering;
              ctx->program->needs_exact = true;
           }
        } else {
-         aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(vmem_op, Format::MUBUF, 4, 0)};
+         aco_ptr<MUBUF_instruction> store{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)};
           store->operands[0] = Operand(rsrc);
           store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1);
           store->operands[2] = offset.type() == RegType::sgpr ? Operand(offset) : Operand((uint32_t) 0);
-         store->operands[3] = Operand(write_data);
-         store->offset = start * elem_size_bytes;
+         store->operands[3] = Operand(write_datas[i]);
+         store->offset = offsets[i];
           store->offen = (offset.type() == RegType::vgpr);
-         store->glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
+         store->glc = glc;
           store->dlc = false;
           store->disable_wqm = true;
-         store->barrier = barrier_buffer;
+         store->sync = sync;
           ctx->program->needs_exact = true;
           ctx->block->instructions.emplace_back(std::move(store));
        }
@@ -6333,179 +6318,40 @@ void visit_atomic_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
     mubuf->glc = return_previous;
     mubuf->dlc = false; /* Not needed for atomics */
     mubuf->disable_wqm = true;
-   mubuf->barrier = barrier_buffer;
+   mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
     ctx->program->needs_exact = true;
     ctx->block->instructions.emplace_back(std::move(mubuf));
  }
  
-void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
-
-   Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
-   Builder bld(ctx->program, ctx->block);
-   Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
-   get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
-}
-
-Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
-{
-   uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) |
-                        S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
-
-   if (addr.type() == RegType::vgpr)
-      return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand(0u), Operand(0u), Operand(-1u), Operand(rsrc_conf));
-   return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand(-1u), Operand(rsrc_conf));
-}
-
-void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
-{
-   Builder bld(ctx->program, ctx->block);
-   unsigned num_components = instr->num_components;
-   unsigned num_bytes = num_components * instr->dest.ssa.bit_size / 8;
-
-   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
-   Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
-
-   bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
-   bool dlc = glc && ctx->options->chip_class >= GFX10;
-   /* VMEM stores don't update the SMEM cache and it's difficult to prove that
-    * it's safe to use SMEM */
-   bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
-   aco_opcode op;
-   if (dst.type() == RegType::vgpr || (glc && ctx->options->chip_class < GFX8) || !can_use_smem) {
-      bool global = ctx->options->chip_class >= GFX9;
-
-      if (ctx->options->chip_class >= GFX7) {
-         switch (num_bytes) {
-         case 4:
-            op = global ? aco_opcode::global_load_dword : aco_opcode::flat_load_dword;
-            break;
-         case 8:
-            op = global ? aco_opcode::global_load_dwordx2 : aco_opcode::flat_load_dwordx2;
-            break;
-         case 12:
-            op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3;
-            break;
-         case 16:
-            op = global ? aco_opcode::global_load_dwordx4 : aco_opcode::flat_load_dwordx4;
-            break;
-         default:
-            unreachable("load_global not implemented for this size.");
-         }
-
-         aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 2, 1)};
-         flat->operands[0] = Operand(addr);
-         flat->operands[1] = Operand(s1);
-         flat->glc = glc;
-         flat->dlc = dlc;
-         flat->barrier = barrier_buffer;
-
-         if (dst.type() == RegType::sgpr) {
-            Temp vec = bld.tmp(RegType::vgpr, dst.size());
-            flat->definitions[0] = Definition(vec);
-            ctx->block->instructions.emplace_back(std::move(flat));
-            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
-         } else {
-            flat->definitions[0] = Definition(dst);
-            ctx->block->instructions.emplace_back(std::move(flat));
-         }
-         emit_split_vector(ctx, dst, num_components);
-      } else {
-         assert(ctx->options->chip_class == GFX6);
-
-         /* GFX6 doesn't support loading vec3, expand to vec4. */
-         num_bytes = num_bytes == 12 ? 16 : num_bytes;
-
-         switch (num_bytes) {
-         case 4:
-            op = aco_opcode::buffer_load_dword;
-            break;
-         case 8:
-            op = aco_opcode::buffer_load_dwordx2;
-            break;
-         case 16:
-            op = aco_opcode::buffer_load_dwordx4;
-            break;
-         default:
-            unreachable("load_global not implemented for this size.");
-         }
-
-         Temp rsrc = get_gfx6_global_rsrc(bld, addr);
+void visit_get_buffer_size(isel_context *ctx, nir_intrinsic_instr *instr) {
  
-         aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)};
-         mubuf->operands[0] = Operand(rsrc);
-         mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
-         mubuf->operands[2] = Operand(0u);
-         mubuf->glc = glc;
-         mubuf->dlc = false;
-         mubuf->offset = 0;
-         mubuf->addr64 = addr.type() == RegType::vgpr;
-         mubuf->disable_wqm = false;
-         mubuf->barrier = barrier_buffer;
-         aco_ptr<Instruction> instr = std::move(mubuf);
-
-         /* expand vector */
-         if (dst.size() == 3) {
-            Temp vec = bld.tmp(v4);
-            instr->definitions[0] = Definition(vec);
-            bld.insert(std::move(instr));
-            emit_split_vector(ctx, vec, 4);
-
-            instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, 3, 1));
-            instr->operands[0] = Operand(emit_extract_vector(ctx, vec, 0, v1));
-            instr->operands[1] = Operand(emit_extract_vector(ctx, vec, 1, v1));
-            instr->operands[2] = Operand(emit_extract_vector(ctx, vec, 2, v1));
-         }
+   Temp index = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
+   Builder bld(ctx->program, ctx->block);
+   Temp desc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), index, Operand(0u));
+   get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa), false);
+}
  
-         if (dst.type() == RegType::sgpr) {
-            Temp vec = bld.tmp(RegType::vgpr, dst.size());
-            instr->definitions[0] = Definition(vec);
-            bld.insert(std::move(instr));
-            expand_vector(ctx, vec, dst, num_components, (1 << num_components) - 1);
-            bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec);
-         } else {
-            instr->definitions[0] = Definition(dst);
-            bld.insert(std::move(instr));
-            emit_split_vector(ctx, dst, num_components);
-         }
-      }
+void visit_load_global(isel_context *ctx, nir_intrinsic_instr *instr)
+{
+   Builder bld(ctx->program, ctx->block);
+   unsigned num_components = instr->num_components;
+   unsigned component_size = instr->dest.ssa.bit_size / 8;
+
+   LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)),
+                        get_ssa_temp(ctx, &instr->dest.ssa),
+                        num_components, component_size};
+   info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
+   info.align_mul = nir_intrinsic_align_mul(instr);
+   info.align_offset = nir_intrinsic_align_offset(instr);
+   info.sync = get_memory_sync_info(instr, storage_buffer, 0);
+   /* VMEM stores don't update the SMEM cache and it's difficult to prove that
+    * it's safe to use SMEM */
+   bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
+   if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) || !can_use_smem) {
+      emit_global_load(ctx, bld, &info);
     } else {
-      switch (num_bytes) {
-         case 4:
-            op = aco_opcode::s_load_dword;
-            break;
-         case 8:
-            op = aco_opcode::s_load_dwordx2;
-            break;
-         case 12:
-         case 16:
-            op = aco_opcode::s_load_dwordx4;
-            break;
-         default:
-            unreachable("load_global not implemented for this size.");
-      }
-      aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)};
-      load->operands[0] = Operand(addr);
-      load->operands[1] = Operand(0u);
-      load->definitions[0] = Definition(dst);
-      load->glc = glc;
-      load->dlc = dlc;
-      load->barrier = barrier_buffer;
-      assert(ctx->options->chip_class >= GFX8 || !glc);
-
-      if (dst.size() == 3) {
-         /* trim vector */
-         Temp vec = bld.tmp(s4);
-         load->definitions[0] = Definition(vec);
-         ctx->block->instructions.emplace_back(std::move(load));
-         emit_split_vector(ctx, vec, 4);
-
-         bld.pseudo(aco_opcode::p_create_vector, Definition(dst),
-                    emit_extract_vector(ctx, vec, 0, s1),
-                    emit_extract_vector(ctx, vec, 1, s1),
-                    emit_extract_vector(ctx, vec, 2, s1));
-      } else {
-         ctx->block->instructions.emplace_back(std::move(load));
-      }
+      info.offset = Operand(bld.as_uniform(info.offset));
+      emit_smem_load(ctx, bld, &info);
     }
  }
  
@@ -6513,38 +6359,26 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
  {
     Builder bld(ctx->program, ctx->block);
     unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
+   unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
  
     Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
     Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
+   memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
+   bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
  
     if (ctx->options->chip_class >= GFX7)
        addr = as_vgpr(ctx, addr);
  
-   unsigned writemask = nir_intrinsic_write_mask(instr);
-   while (writemask) {
-      int start, count;
-      u_bit_scan_consecutive_range(&writemask, &start, &count);
-      if (count == 3 && ctx->options->chip_class == GFX6) {
-         /* GFX6 doesn't support storing vec3, split it. */
-         writemask |= 1u << (start + 2);
-         count = 2;
-      }
-      unsigned num_bytes = count * elem_size_bytes;
-
-      Temp write_data = data;
-      if (count != instr->num_components) {
-         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
-         for (int i = 0; i < count; i++)
-            vec->operands[i] = Operand(emit_extract_vector(ctx, data, start + i, v1));
-         write_data = bld.tmp(RegType::vgpr, count);
-         vec->definitions[0] = Definition(write_data);
-         ctx->block->instructions.emplace_back(std::move(vec));
-      }
-
-      bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
-      unsigned offset = start * elem_size_bytes;
+   unsigned write_count = 0;
+   Temp write_datas[32];
+   unsigned offsets[32];
+   split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
+                      16, &write_count, write_datas, offsets);
  
+   for (unsigned i = 0; i < write_count; i++) {
        if (ctx->options->chip_class >= GFX7) {
+         unsigned offset = offsets[i];
+         Temp store_addr = addr;
           if (offset > 0 && ctx->options->chip_class < GFX9) {
              Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1);
              Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1);
@@ -6557,14 +6391,20 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
                       Operand(0u), addr1,
                       carry).def(1).setHint(vcc);
  
-            addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
+            store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1);
  
              offset = 0;
           }
  
           bool global = ctx->options->chip_class >= GFX9;
           aco_opcode op;
-         switch (num_bytes) {
+         switch (write_datas[i].bytes()) {
+         case 1:
+            op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte;
+            break;
+         case 2:
+            op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short;
+            break;
           case 4:
              op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword;
              break;
@@ -6582,33 +6422,20 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
           }
  
           aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)};
-         flat->operands[0] = Operand(addr);
+         flat->operands[0] = Operand(store_addr);
           flat->operands[1] = Operand(s1);
-         flat->operands[2] = Operand(data);
+         flat->operands[2] = Operand(write_datas[i]);
           flat->glc = glc;
           flat->dlc = false;
           flat->offset = offset;
           flat->disable_wqm = true;
-         flat->barrier = barrier_buffer;
+         flat->sync = sync;
           ctx->program->needs_exact = true;
           ctx->block->instructions.emplace_back(std::move(flat));
        } else {
           assert(ctx->options->chip_class == GFX6);
  
-         aco_opcode op;
-         switch (num_bytes) {
-         case 4:
-            op = aco_opcode::buffer_store_dword;
-            break;
-         case 8:
-            op = aco_opcode::buffer_store_dwordx2;
-            break;
-         case 16:
-            op = aco_opcode::buffer_store_dwordx4;
-            break;
-         default:
-            unreachable("store_global not implemented for this size.");
-         }
+         aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
  
           Temp rsrc = get_gfx6_global_rsrc(bld, addr);
  
@@ -6616,13 +6443,13 @@ void visit_store_global(isel_context *ctx, nir_intrinsic_instr *instr)
           mubuf->operands[0] = Operand(rsrc);
           mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1);
           mubuf->operands[2] = Operand(0u);
-         mubuf->operands[3] = Operand(write_data);
+         mubuf->operands[3] = Operand(write_datas[i]);
           mubuf->glc = glc;
           mubuf->dlc = false;
-         mubuf->offset = offset;
+         mubuf->offset = offsets[i];
           mubuf->addr64 = addr.type() == RegType::vgpr;
           mubuf->disable_wqm = true;
-         mubuf->barrier = barrier_buffer;
+         mubuf->sync = sync;
           ctx->program->needs_exact = true;
           ctx->block->instructions.emplace_back(std::move(mubuf));
        }
@@ -6715,7 +6542,7 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
        flat->dlc = false; /* Not needed for atomics */
        flat->offset = 0;
        flat->disable_wqm = true;
-      flat->barrier = barrier_buffer;
+      flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
        ctx->program->needs_exact = true;
        ctx->block->instructions.emplace_back(std::move(flat));
     } else {
@@ -6782,40 +6609,63 @@ void visit_global_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
        mubuf->offset = 0;
        mubuf->addr64 = addr.type() == RegType::vgpr;
        mubuf->disable_wqm = true;
-      mubuf->barrier = barrier_buffer;
+      mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
        ctx->program->needs_exact = true;
        ctx->block->instructions.emplace_back(std::move(mubuf));
     }
  }
  
-void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
-   Builder bld(ctx->program, ctx->block);
-   switch(instr->intrinsic) {
-      case nir_intrinsic_group_memory_barrier:
-      case nir_intrinsic_memory_barrier:
-         bld.barrier(aco_opcode::p_memory_barrier_common);
-         break;
-      case nir_intrinsic_memory_barrier_buffer:
-         bld.barrier(aco_opcode::p_memory_barrier_buffer);
-         break;
-      case nir_intrinsic_memory_barrier_image:
-         bld.barrier(aco_opcode::p_memory_barrier_image);
-         break;
-      case nir_intrinsic_memory_barrier_tcs_patch:
-      case nir_intrinsic_memory_barrier_shared:
-         bld.barrier(aco_opcode::p_memory_barrier_shared);
-         break;
-      default:
-         unreachable("Unimplemented memory barrier intrinsic");
-         break;
+sync_scope translate_nir_scope(nir_scope scope)
+{
+   switch (scope) {
+   case NIR_SCOPE_NONE:
+   case NIR_SCOPE_INVOCATION:
+      return scope_invocation;
+   case NIR_SCOPE_SUBGROUP:
+      return scope_subgroup;
+   case NIR_SCOPE_WORKGROUP:
+      return scope_workgroup;
+   case NIR_SCOPE_QUEUE_FAMILY:
+      return scope_queuefamily;
+   case NIR_SCOPE_DEVICE:
+      return scope_device;
     }
+   unreachable("invalid scope");
+}
+
+void emit_scoped_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
+   Builder bld(ctx->program, ctx->block);
+
+   unsigned semantics = 0;
+   unsigned storage = 0;
+   sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
+   sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
+
+   unsigned nir_storage = nir_intrinsic_memory_modes(instr);
+   if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global))
+      storage |= storage_buffer | storage_image; //TODO: split this when NIR gets nir_var_mem_image
+   if (ctx->shader->info.stage == MESA_SHADER_COMPUTE && (nir_storage & nir_var_mem_shared))
+      storage |= storage_shared;
+   if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL && (nir_storage & nir_var_shader_out))
+      storage |= storage_shared;
+
+   unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
+   if (nir_semantics & NIR_MEMORY_ACQUIRE)
+      semantics |= semantic_acquire | semantic_release;
+   if (nir_semantics & NIR_MEMORY_RELEASE)
+      semantics |= semantic_acquire | semantic_release;
+
+   assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
+
+   bld.barrier(aco_opcode::p_barrier,
+               memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
+               exec_scope);
  }
  
  void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
  {
     // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read()
     Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
-   assert(instr->dest.ssa.bit_size >= 32 && "Bitsize not supported in load_shared.");
     Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
     Builder bld(ctx->program, ctx->block);
  
@@ -6830,7 +6680,6 @@ void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
     Temp data = get_ssa_temp(ctx, instr->src[0].ssa);
     Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
     unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
-   assert(elem_size_bytes >= 4 && "Only 32bit & 64bit store_shared currently supported.");
  
     unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes;
     store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align);
@@ -6839,7 +6688,8 @@ void visit_store_shared(isel_context *ctx, nir_intrinsic_instr *instr)
  void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
  {
     unsigned offset = nir_intrinsic_base(instr);
-   Operand m = load_lds_size_m0(ctx);
+   Builder bld(ctx->program, ctx->block);
+   Operand m = load_lds_size_m0(bld);
     Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
     Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
  
@@ -6898,7 +6748,7 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
           op32 = aco_opcode::ds_write_b32;
           op64 = aco_opcode::ds_write_b64;
           op32_rtn = aco_opcode::ds_wrxchg_rtn_b32;
-         op64_rtn = aco_opcode::ds_wrxchg2_rtn_b64;
+         op64_rtn = aco_opcode::ds_wrxchg_rtn_b64;
           break;
        case nir_intrinsic_shared_atomic_comp_swap:
           op32 = aco_opcode::ds_cmpst_b32;
@@ -6907,6 +6757,12 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
           op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
           num_operands = 4;
           break;
+      case nir_intrinsic_shared_atomic_fadd:
+         op32 = aco_opcode::ds_add_f32;
+         op32_rtn = aco_opcode::ds_add_rtn_f32;
+         op64 = aco_opcode::num_opcodes;
+         op64_rtn = aco_opcode::num_opcodes;
+         break;
        default:
           unreachable("Unhandled shared atomic intrinsic");
     }
@@ -6932,7 +6788,6 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
     }
  
     if (offset > 65535) {
-      Builder bld(ctx->program, ctx->block);
        address = bld.vadd32(bld.def(v1), Operand(offset), address);
        offset = 0;
     }
@@ -6947,6 +6802,7 @@ void visit_shared_atomic(isel_context *ctx, nir_intrinsic_instr *instr)
     ds->offset0 = offset;
     if (return_previous)
        ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
+   ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
     ctx->block->instructions.emplace_back(std::move(ds));
  }
  
@@ -6958,7 +6814,7 @@ Temp get_scratch_resource(isel_context *ctx)
        scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
  
     uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
-                        S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);;
+                        S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
  
     if (ctx->program->chip_class >= GFX10) {
        rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
@@ -6969,130 +6825,49 @@ Temp get_scratch_resource(isel_context *ctx)
                     S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
     }
  
-   /* older generations need element size = 16 bytes. element size removed in GFX9 */
+   /* older generations need element size = 4 bytes. element size removed in GFX9 */
     if (ctx->program->chip_class <= GFX8)
-      rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
+      rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
  
     return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
  }
  
  void visit_load_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
-   assert(instr->dest.ssa.bit_size == 32 || instr->dest.ssa.bit_size == 64);
     Builder bld(ctx->program, ctx->block);
     Temp rsrc = get_scratch_resource(ctx);
     Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
     Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
  
-   aco_opcode op;
-   switch (dst.size()) {
-      case 1:
-         op = aco_opcode::buffer_load_dword;
-         break;
-      case 2:
-         op = aco_opcode::buffer_load_dwordx2;
-         break;
-      case 3:
-         op = aco_opcode::buffer_load_dwordx3;
-         break;
-      case 4:
-         op = aco_opcode::buffer_load_dwordx4;
-         break;
-      case 6:
-      case 8: {
-         std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
-         Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
-                                bld.def(v4), rsrc, offset,
-                                ctx->program->scratch_offset, 0, true);
-         Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
-                                                  aco_opcode::buffer_load_dwordx4,
-                                dst.size() == 6 ? bld.def(v2) : bld.def(v4),
-                                rsrc, offset, ctx->program->scratch_offset, 16, true);
-         emit_split_vector(ctx, lower, 2);
-         elems[0] = emit_extract_vector(ctx, lower, 0, v2);
-         elems[1] = emit_extract_vector(ctx, lower, 1, v2);
-         if (dst.size() == 8) {
-            emit_split_vector(ctx, upper, 2);
-            elems[2] = emit_extract_vector(ctx, upper, 0, v2);
-            elems[3] = emit_extract_vector(ctx, upper, 1, v2);
-         } else {
-            elems[2] = upper;
-         }
-
-         aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector,
-                                                                         Format::PSEUDO, dst.size() / 2, 1)};
-         for (unsigned i = 0; i < dst.size() / 2; i++)
-            vec->operands[i] = Operand(elems[i]);
-         vec->definitions[0] = Definition(dst);
-         bld.insert(std::move(vec));
-         ctx->allocated_vec.emplace(dst.id(), elems);
-         return;
-      }
-      default:
-         unreachable("Wrong dst size for nir_intrinsic_load_scratch");
-   }
-
-   bld.mubuf(op, Definition(dst), rsrc, offset, ctx->program->scratch_offset, 0, true);
-   emit_split_vector(ctx, dst, instr->num_components);
+   LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components,
+                        instr->dest.ssa.bit_size / 8u, rsrc};
+   info.align_mul = nir_intrinsic_align_mul(instr);
+   info.align_offset = nir_intrinsic_align_offset(instr);
+   info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
+   info.sync = memory_sync_info(storage_scratch, semantic_private);
+   info.soffset = ctx->program->scratch_offset;
+   emit_scratch_load(ctx, bld, &info);
  }
  
  void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
-   assert(instr->src[0].ssa->bit_size == 32 || instr->src[0].ssa->bit_size == 64);
     Builder bld(ctx->program, ctx->block);
     Temp rsrc = get_scratch_resource(ctx);
     Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
     Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
  
     unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
-   unsigned writemask = nir_intrinsic_write_mask(instr);
-
-   while (writemask) {
-      int start, count;
-      u_bit_scan_consecutive_range(&writemask, &start, &count);
-      int num_bytes = count * elem_size_bytes;
-
-      if (num_bytes > 16) {
-         assert(elem_size_bytes == 8);
-         writemask |= (((count - 2) << 1) - 1) << (start + 2);
-         count = 2;
-         num_bytes = 16;
-      }
-
-      // TODO: check alignment of sub-dword stores
-      // TODO: split 3 bytes. there is no store instruction for that
-
-      Temp write_data;
-      if (count != instr->num_components) {
-         aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, count, 1)};
-         for (int i = 0; i < count; i++) {
-            Temp elem = emit_extract_vector(ctx, data, start + i, RegClass(RegType::vgpr, elem_size_bytes / 4));
-            vec->operands[i] = Operand(elem);
-         }
-         write_data = bld.tmp(RegClass(RegType::vgpr, count * elem_size_bytes / 4));
-         vec->definitions[0] = Definition(write_data);
-         ctx->block->instructions.emplace_back(std::move(vec));
-      } else {
-         write_data = data;
-      }
+   unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
  
-      aco_opcode op;
-      switch (num_bytes) {
-         case 4:
-            op = aco_opcode::buffer_store_dword;
-            break;
-         case 8:
-            op = aco_opcode::buffer_store_dwordx2;
-            break;
-         case 12:
-            op = aco_opcode::buffer_store_dwordx3;
-            break;
-         case 16:
-            op = aco_opcode::buffer_store_dwordx4;
-            break;
-         default:
-            unreachable("Invalid data size for nir_intrinsic_store_scratch.");
-      }
+   unsigned write_count = 0;
+   Temp write_datas[32];
+   unsigned offsets[32];
+   unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;
+   split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
+                      swizzle_component_size, &write_count, write_datas, offsets);
  
-      bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true);
+   for (unsigned i = 0; i < write_count; i++) {
+      aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
+      Instruction *instr = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
+      static_cast<MUBUF_instruction *>(instr)->sync = memory_sync_info(storage_scratch, semantic_private);
     }
  }
  
@@ -7206,8 +6981,7 @@ void visit_emit_vertex_with_counter(isel_context *ctx, nir_intrinsic_instr *inst
              mtbuf->offset = const_offset;
              mtbuf->glc = true;
              mtbuf->slc = true;
-            mtbuf->barrier = barrier_gs_data;
-            mtbuf->can_reorder = true;
+            mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);
              bld.insert(std::move(mtbuf));
           }
  
@@ -7359,9 +7133,7 @@ void emit_uniform_subgroup(isel_context *ctx, nir_intrinsic_instr *instr, Temp s
     } else if (src.regClass() == s2) {
        bld.sop1(aco_opcode::s_mov_b64, dst, src);
     } else {
-      fprintf(stderr, "Unimplemented NIR instr bit size: ");
-      nir_print_instr(&instr->instr, stderr);
-      fprintf(stderr, "\n");
+      isel_err(&instr->instr, "Unimplemented NIR instr bit size");
     }
  }
  
@@ -7399,10 +7171,11 @@ void emit_interp_center(isel_context *ctx, Temp dst, Temp pos1, Temp pos2)
     }
  
     /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
-   Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
-   Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
-   tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
-   tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
+   aco_opcode mad = ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
+   Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
+   Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
+   tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
+   tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
     Temp wqm1 = bld.tmp(v1);
     emit_wqm(ctx, tmp1, wqm1, true);
     Temp wqm2 = bld.tmp(v1);
@@ -7473,6 +7246,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
        Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
        nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
        Temp private_segment_buffer = ctx->program->private_segment_buffer;
+      //TODO: bounds checking?
        if (addr.type() == RegType::sgpr) {
           Operand offset;
           if (const_addr) {
@@ -7531,8 +7305,6 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
           load->glc = false;
           load->dlc = false;
           load->disable_wqm = false;
-         load->barrier = barrier_none;
-         load->can_reorder = true;
           ctx->block->instructions.emplace_back(std::move(load));
        }
  
@@ -7645,6 +7417,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
     case nir_intrinsic_shared_atomic_xor:
     case nir_intrinsic_shared_atomic_exchange:
     case nir_intrinsic_shared_atomic_comp_swap:
+   case nir_intrinsic_shared_atomic_fadd:
        visit_shared_atomic(ctx, instr);
        break;
     case nir_intrinsic_image_deref_load:
@@ -7713,27 +7486,8 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
     case nir_intrinsic_get_buffer_size:
        visit_get_buffer_size(ctx, instr);
        break;
-   case nir_intrinsic_control_barrier: {
-      if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
-         /* GFX6 only (thanks to a hw bug workaround):
-          * The real barrier instruction isn’t needed, because an entire patch
-          * always fits into a single wave.
-          */
-         break;
-      }
-
-      if (ctx->program->workgroup_size > ctx->program->wave_size)
-         bld.sopp(aco_opcode::s_barrier);
-
-      break;
-   }
-   case nir_intrinsic_memory_barrier_tcs_patch:
-   case nir_intrinsic_group_memory_barrier:
-   case nir_intrinsic_memory_barrier:
-   case nir_intrinsic_memory_barrier_buffer:
-   case nir_intrinsic_memory_barrier_image:
-   case nir_intrinsic_memory_barrier_shared:
-      emit_memory_barrier(ctx, instr);
+   case nir_intrinsic_scoped_barrier:
+      emit_scoped_barrier(ctx, instr);
        break;
     case nir_intrinsic_load_num_work_groups: {
        Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
@@ -7810,9 +7564,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
        } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
           bld.vopc(aco_opcode::v_cmp_lg_u64, lanemask_tmp, Operand(0u), src);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        if (dst.size() != bld.lm.size()) {
           /* Wave32 with ballot size set to 64 */
@@ -7824,14 +7576,21 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
     case nir_intrinsic_shuffle:
     case nir_intrinsic_read_invocation: {
        Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
-      if (!ctx->divergent_vals[instr->src[0].ssa->index]) {
+      if (!nir_src_is_divergent(instr->src[0])) {
           emit_uniform_subgroup(ctx, instr, src);
        } else {
           Temp tid = get_ssa_temp(ctx, instr->src[1].ssa);
-         if (instr->intrinsic == nir_intrinsic_read_invocation || !ctx->divergent_vals[instr->src[1].ssa->index])
+         if (instr->intrinsic == nir_intrinsic_read_invocation || !nir_src_is_divergent(instr->src[1]))
              tid = bld.as_uniform(tid);
           Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
-         if (src.regClass() == v1) {
+         if (src.regClass() == v1b || src.regClass() == v2b) {
+            Temp tmp = bld.tmp(v1);
+            tmp = emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), tmp);
+            if (dst.type() == RegType::vgpr)
+               bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(src.regClass() == v1b ? v3b : v2b), tmp);
+            else
+               bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
+         } else if (src.regClass() == v1) {
              emit_wqm(ctx, emit_bpermute(ctx, bld, tid, src), dst);
           } else if (src.regClass() == v2) {
              Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
@@ -7857,9 +7616,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
              tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
              emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst);
           } else {
-            fprintf(stderr, "Unimplemented NIR instr bit size: ");
-            nir_print_instr(&instr->instr, stderr);
-            fprintf(stderr, "\n");
+            isel_err(&instr->instr, "Unimplemented NIR instr bit size");
           }
        }
        break;
@@ -7876,7 +7633,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
     case nir_intrinsic_read_first_invocation: {
        Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
        Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
-      if (src.regClass() == v1) {
+      if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) {
           emit_wqm(ctx,
                    bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src),
                    dst);
@@ -7897,9 +7654,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
        } else if (src.regClass() == s2) {
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -7934,7 +7689,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
           nir_intrinsic_cluster_size(instr) : 0;
        cluster_size = util_next_power_of_two(MIN2(cluster_size ? cluster_size : ctx->program->wave_size, ctx->program->wave_size));
  
-      if (!ctx->divergent_vals[instr->src[0].ssa->index] && (op == nir_op_ior || op == nir_op_iand)) {
+      if (!nir_src_is_divergent(instr->src[0]) && (op == nir_op_ior || op == nir_op_iand)) {
           emit_uniform_subgroup(ctx, instr, src);
        } else if (instr->dest.ssa.bit_size == 1) {
           if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin)
@@ -7961,27 +7716,31 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
        } else if (cluster_size == 1) {
           bld.copy(Definition(dst), src);
        } else {
-         src = as_vgpr(ctx, src);
+         unsigned bit_size = instr->src[0].ssa->bit_size;
+
+         src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8));
  
           ReduceOp reduce_op;
           switch (op) {
-         #define CASE(name) case nir_op_##name: reduce_op = (src.regClass() == v1) ? name##32 : name##64; break;
-            CASE(iadd)
-            CASE(imul)
-            CASE(fadd)
-            CASE(fmul)
-            CASE(imin)
-            CASE(umin)
-            CASE(fmin)
-            CASE(imax)
-            CASE(umax)
-            CASE(fmax)
-            CASE(iand)
-            CASE(ior)
-            CASE(ixor)
+         #define CASEI(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : (bit_size == 8) ? name##8 : name##64; break;
+         #define CASEF(name) case nir_op_##name: reduce_op = (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; break;
+            CASEI(iadd)
+            CASEI(imul)
+            CASEI(imin)
+            CASEI(umin)
+            CASEI(imax)
+            CASEI(umax)
+            CASEI(iand)
+            CASEI(ior)
+            CASEI(ixor)
+            CASEF(fadd)
+            CASEF(fmul)
+            CASEF(fmin)
+            CASEF(fmax)
              default:
                 unreachable("unknown reduction op");
-         #undef CASE
+         #undef CASEI
+         #undef CASEF
           }
  
           aco_opcode aco_op;
@@ -8017,7 +7776,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
     }
     case nir_intrinsic_quad_broadcast: {
        Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
-      if (!ctx->divergent_vals[instr->dest.ssa.index]) {
+      if (!nir_dest_is_divergent(instr->dest)) {
           emit_uniform_subgroup(ctx, instr, src);
        } else {
           Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
@@ -8034,6 +7793,20 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
                       bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp,
                                bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))));
              emit_wqm(ctx, tmp, dst);
+         } else if (instr->dest.ssa.bit_size == 8) {
+            Temp tmp = bld.tmp(v1);
+            if (ctx->program->chip_class >= GFX8)
+               emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
+            else
+               emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp);
+            bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
+         } else if (instr->dest.ssa.bit_size == 16) {
+            Temp tmp = bld.tmp(v1);
+            if (ctx->program->chip_class >= GFX8)
+               emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
+            else
+               emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl), tmp);
+            bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
           } else if (instr->dest.ssa.bit_size == 32) {
              if (ctx->program->chip_class >= GFX8)
                 emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), dst);
@@ -8052,9 +7825,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
              bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
              emit_split_vector(ctx, dst, 2);
           } else {
-            fprintf(stderr, "Unimplemented NIR instr bit size: ");
-            nir_print_instr(&instr->instr, stderr);
-            fprintf(stderr, "\n");
+            isel_err(&instr->instr, "Unimplemented NIR instr bit size");
           }
        }
        break;
@@ -8064,7 +7835,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
     case nir_intrinsic_quad_swap_diagonal:
     case nir_intrinsic_quad_swizzle_amd: {
        Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
-      if (!ctx->divergent_vals[instr->dest.ssa.index]) {
+      if (!nir_dest_is_divergent(instr->dest)) {
           emit_uniform_subgroup(ctx, instr, src);
           break;
        }
@@ -8098,6 +7869,20 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
              src = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl);
           Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
           emit_wqm(ctx, tmp, dst);
+      } else if (instr->dest.ssa.bit_size == 8) {
+         Temp tmp = bld.tmp(v1);
+         if (ctx->program->chip_class >= GFX8)
+            emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
+         else
+            emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v3b), tmp);
+      } else if (instr->dest.ssa.bit_size == 16) {
+         Temp tmp = bld.tmp(v1);
+         if (ctx->program->chip_class >= GFX8)
+            emit_wqm(ctx, bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl), tmp);
+         else
+            emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, dpp_ctrl), tmp);
+         bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(v2b), tmp);
        } else if (instr->dest.ssa.bit_size == 32) {
           Temp tmp;
           if (ctx->program->chip_class >= GFX8)
@@ -8118,35 +7903,41 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
           emit_split_vector(ctx, dst, 2);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
     case nir_intrinsic_masked_swizzle_amd: {
        Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
-      if (!ctx->divergent_vals[instr->dest.ssa.index]) {
+      if (!nir_dest_is_divergent(instr->dest)) {
           emit_uniform_subgroup(ctx, instr, src);
           break;
        }
        Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
        uint32_t mask = nir_intrinsic_swizzle_mask(instr);
-      if (dst.regClass() == v1) {
-         emit_wqm(ctx,
-                  bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
-                  dst);
+      if (instr->dest.ssa.bit_size == 1) {
+         assert(src.regClass() == bld.lm);
+         src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
+         src = emit_masked_swizzle(ctx, bld, src, mask);
+         Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
+         emit_wqm(ctx, tmp, dst);
+      } else if (dst.regClass() == v1b) {
+         Temp tmp = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask));
+         emit_extract_vector(ctx, tmp, 0, dst);
+      } else if (dst.regClass() == v2b) {
+         Temp tmp = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask));
+         emit_extract_vector(ctx, tmp, 0, dst);
+      } else if (dst.regClass() == v1) {
+         emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask), dst);
        } else if (dst.regClass() == v2) {
           Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
           bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
-         lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
-         hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
+         lo = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, lo, mask));
+         hi = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, hi, mask));
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
           emit_split_vector(ctx, dst, 2);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -8168,9 +7959,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
           bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
           emit_split_vector(ctx, dst, 2);
        } else {
-         fprintf(stderr, "Unimplemented NIR instr bit size: ");
-         nir_print_instr(&instr->instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
        }
        break;
     }
@@ -8223,10 +8012,21 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
                 get_ssa_temp(ctx, &instr->dest.ssa));
        break;
     }
-   case nir_intrinsic_shader_clock:
-      bld.smem(aco_opcode::s_memtime, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
-      emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
+   case nir_intrinsic_shader_clock: {
+      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+      if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP && ctx->options->chip_class >= GFX10_3) {
+         /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
+         Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
+         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand(0u));
+      } else {
+         aco_opcode opcode =
+            nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ?
+               aco_opcode::s_memrealtime : aco_opcode::s_memtime;
+         bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
+      }
+      emit_split_vector(ctx, dst, 2);
        break;
+   }
     case nir_intrinsic_load_vertex_id_zero_base: {
        Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
        bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id));
@@ -8310,9 +8110,7 @@ void visit_intrinsic(isel_context *ctx, nir_intrinsic_instr *instr)
        break;
     }
     default:
-      fprintf(stderr, "Unimplemented intrinsic instr: ");
-      nir_print_instr(&instr->instr, stderr);
-      fprintf(stderr, "\n");
+      isel_err(&instr->instr, "Unimplemented intrinsic instr");
        abort();
  
        break;
@@ -8438,6 +8236,8 @@ void prepare_cube_coords(isel_context *ctx, std::vector<Temp>& coords, Temp* ddx
  {
     Builder bld(ctx->program, ctx->block);
     Temp ma, tc, sc, id;
+   aco_opcode madak = ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32;
+   aco_opcode madmk = ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32;
  
     if (is_array) {
        coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);
@@ -8458,11 +8258,11 @@ void prepare_cube_coords(isel_context *ctx, std::vector<Temp>& coords, Temp* ddx
  
     sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
     if (!is_deriv)
-      sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
+      sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
  
     tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
     if (!is_deriv)
-      tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
+      tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
  
     id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);
  
@@ -8493,7 +8293,7 @@ void prepare_cube_coords(isel_context *ctx, std::vector<Temp>& coords, Temp* ddx
     }
  
     if (is_array)
-      id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/));
+      id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/));
     coords.resize(3);
     coords[0] = sc;
     coords[1] = tc;
@@ -8518,9 +8318,11 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
  {
     Builder bld(ctx->program, ctx->block);
     bool has_bias = false, has_lod = false, level_zero = false, has_compare = false,
-        has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false;
+        has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, has_sample_index = false,
+        has_clamped_lod = false;
     Temp resource, sampler, fmask_ptr, bias = Temp(), compare = Temp(), sample_index = Temp(),
-        lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp();
+        lod = Temp(), offset = Temp(), ddx = Temp(), ddy = Temp(),
+        clamped_lod = Temp();
     std::vector<Temp> coords;
     std::vector<Temp> derivs;
     nir_const_value *sample_index_cv = NULL;
@@ -8542,10 +8344,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
           break;
        }
        case nir_tex_src_bias:
-         if (instr->op == nir_texop_txb) {
-            bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
-            has_bias = true;
-         }
+         bias = get_ssa_temp(ctx, instr->src[i].src.ssa);
+         has_bias = true;
           break;
        case nir_tex_src_lod: {
           nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
@@ -8558,6 +8358,10 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
           }
           break;
        }
+      case nir_tex_src_min_lod:
+         clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa);
+         has_clamped_lod = true;
+         break;
        case nir_tex_src_comparator:
           if (instr->is_shadow) {
              compare = get_ssa_temp(ctx, instr->src[i].src.ssa);
@@ -8598,10 +8402,20 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
        Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(16u | 4u<<16));
        Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand(1u), samples_log2);
        Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, Operand(28u | 4u<<16 /* offset=28, width=4 */));
-      Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
  
+      Operand default_sample = Operand(1u);
+      if (ctx->options->robust_buffer_access) {
+         /* Extract the second dword of the descriptor, if it's
+         * all zero, then it's a null descriptor.
+         */
+         Temp dword1 = emit_extract_vector(ctx, resource, 1, s1);
+         Temp is_non_null_descriptor = bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand(0u));
+         default_sample = Operand(is_non_null_descriptor);
+      }
+
+      Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand(14u));
        bld.sop2(aco_opcode::s_cselect_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)),
-               samples, Operand(1u), bld.scc(is_msaa));
+               samples, default_sample, bld.scc(is_msaa));
        return;
     }
  
@@ -8790,7 +8604,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
        tex->da = da;
        tex->definitions[0] = Definition(tmp_dst);
        tex->dim = dim;
-      tex->can_reorder = true;
        ctx->block->instructions.emplace_back(std::move(tex));
  
        if (div_by_6) {
@@ -8823,7 +8636,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
        tex->da = da;
        Temp size = bld.tmp(v2);
        tex->definitions[0] = Definition(size);
-      tex->can_reorder = true;
        ctx->block->instructions.emplace_back(std::move(tex));
        emit_split_vector(ctx, size, size.size());
  
@@ -8925,7 +8737,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
        mubuf->operands[2] = Operand((uint32_t) 0);
        mubuf->definitions[0] = Definition(tmp_dst);
        mubuf->idxen = true;
-      mubuf->can_reorder = true;
        ctx->block->instructions.emplace_back(std::move(mubuf));
  
        expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
@@ -8948,6 +8759,8 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
        args.emplace_back(sample_index);
     if (has_lod)
        args.emplace_back(lod);
+   if (has_clamped_lod)
+      args.emplace_back(clamped_lod);
  
     Temp arg = bld.tmp(RegClass(RegType::vgpr, args.size()));
     aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, args.size(), 1)};
@@ -8972,7 +8785,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
        tex->unrm = true;
        tex->da = da;
        tex->definitions[0] = Definition(tmp_dst);
-      tex->can_reorder = true;
        ctx->block->instructions.emplace_back(std::move(tex));
  
        if (instr->op == nir_texop_samples_identical) {
@@ -8992,7 +8804,21 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
     // TODO: would be better to do this by adding offsets, but needs the opcodes ordered.
     aco_opcode opcode = aco_opcode::image_sample;
     if (has_offset) { /* image_sample_*_o */
-      if (has_compare) {
+      if (has_clamped_lod) {
+         if (has_compare) {
+            opcode = aco_opcode::image_sample_c_cl_o;
+            if (has_derivs)
+               opcode = aco_opcode::image_sample_c_d_cl_o;
+            if (has_bias)
+               opcode = aco_opcode::image_sample_c_b_cl_o;
+         } else {
+            opcode = aco_opcode::image_sample_cl_o;
+            if (has_derivs)
+               opcode = aco_opcode::image_sample_d_cl_o;
+            if (has_bias)
+               opcode = aco_opcode::image_sample_b_cl_o;
+         }
+      } else if (has_compare) {
           opcode = aco_opcode::image_sample_c_o;
           if (has_derivs)
              opcode = aco_opcode::image_sample_c_d_o;
@@ -9013,6 +8839,20 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
           if (has_lod)
              opcode = aco_opcode::image_sample_l_o;
        }
+   } else if (has_clamped_lod) { /* image_sample_*_cl */
+      if (has_compare) {
+         opcode = aco_opcode::image_sample_c_cl;
+         if (has_derivs)
+            opcode = aco_opcode::image_sample_c_d_cl;
+         if (has_bias)
+            opcode = aco_opcode::image_sample_c_b_cl;
+      } else {
+         opcode = aco_opcode::image_sample_cl;
+         if (has_derivs)
+            opcode = aco_opcode::image_sample_d_cl;
+         if (has_bias)
+            opcode = aco_opcode::image_sample_b_cl;
+      }
     } else { /* no offset */
        if (has_compare) {
           opcode = aco_opcode::image_sample_c;
@@ -9038,14 +8878,34 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
     }
  
     if (instr->op == nir_texop_tg4) {
-      if (has_offset) {
-         opcode = aco_opcode::image_gather4_lz_o;
-         if (has_compare)
+      if (has_offset) { /* image_gather4_*_o */
+         if (has_compare) {
              opcode = aco_opcode::image_gather4_c_lz_o;
+            if (has_lod)
+               opcode = aco_opcode::image_gather4_c_l_o;
+            if (has_bias)
+               opcode = aco_opcode::image_gather4_c_b_o;
+         } else {
+            opcode = aco_opcode::image_gather4_lz_o;
+            if (has_lod)
+               opcode = aco_opcode::image_gather4_l_o;
+            if (has_bias)
+               opcode = aco_opcode::image_gather4_b_o;
+         }
        } else {
-         opcode = aco_opcode::image_gather4_lz;
-         if (has_compare)
+         if (has_compare) {
              opcode = aco_opcode::image_gather4_c_lz;
+            if (has_lod)
+               opcode = aco_opcode::image_gather4_c_l;
+            if (has_bias)
+               opcode = aco_opcode::image_gather4_c_b;
+         } else {
+            opcode = aco_opcode::image_gather4_lz;
+            if (has_lod)
+               opcode = aco_opcode::image_gather4_l;
+            if (has_bias)
+               opcode = aco_opcode::image_gather4_b;
+         }
        }
     } else if (instr->op == nir_texop_lod) {
        opcode = aco_opcode::image_get_lod;
@@ -9068,7 +8928,6 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
     tex->dmask = dmask;
     tex->da = da;
     tex->definitions[0] = Definition(tmp_dst);
-   tex->can_reorder = true;
     ctx->block->instructions.emplace_back(std::move(tex));
  
     if (tg4_integer_cube_workaround) {
@@ -9096,13 +8955,19 @@ void visit_tex(isel_context *ctx, nir_tex_instr *instr)
  }
  
  
-Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa)
+Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc, bool logical)
  {
     Temp tmp = get_ssa_temp(ctx, ssa);
-   if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
-      return Operand(tmp.regClass());
-   else
+   if (ssa->parent_instr->type == nir_instr_type_ssa_undef) {
+      return Operand(rc);
+   } else if (logical && ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) {
+      if (ctx->program->wave_size == 64)
+         return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX : 0u);
+      else
+         return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX : 0u);
+   } else {
        return Operand(tmp);
+   }
  }
  
  void visit_phi(isel_context *ctx, nir_phi_instr *instr)
@@ -9111,7 +8976,7 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr)
     Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
     assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask);
  
-   bool logical = !dst.is_linear() || ctx->divergent_vals[instr->dest.ssa.index];
+   bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest);
     logical |= ctx->block->kind & block_kind_merge;
     aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi;
  
@@ -9145,7 +9010,7 @@ void visit_phi(isel_context *ctx, nir_phi_instr *instr)
        if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
           continue;
        cur_pred_idx++;
-      Operand op = get_phi_operand(ctx, src.second);
+      Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
        operands[num_operands++] = op;
        num_defined += !op.isUndefined();
     }
@@ -9312,9 +9177,7 @@ void visit_jump(isel_context *ctx, nir_jump_instr *instr)
        }
        break;
     default:
-      fprintf(stderr, "Unknown NIR jump instr: ");
-      nir_print_instr(&instr->instr, stderr);
-      fprintf(stderr, "\n");
+      isel_err(&instr->instr, "Unknown NIR jump instr");
        abort();
     }
  
@@ -9372,9 +9235,7 @@ void visit_block(isel_context *ctx, nir_block *block)
           visit_jump(ctx, nir_instr_as_jump(instr));
           break;
        default:
-         fprintf(stderr, "Unknown NIR instr type: ");
-         nir_print_instr(instr, stderr);
-         fprintf(stderr, "\n");
+         isel_err(instr, "Unknown NIR instr type");
           //abort();
        }
     }
@@ -9805,7 +9666,7 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt)
     aco_ptr<Pseudo_branch_instruction> branch;
     if_context ic;
  
-   if (!ctx->divergent_vals[if_stmt->condition.ssa->index]) { /* uniform condition */
+   if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */
        /**
         * Uniform conditionals are represented in the following way*) :
         *
@@ -9833,8 +9694,6 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt)
        visit_cf_list(ctx, &if_stmt->else_list);
  
        end_uniform_if(ctx, &ic);
-
-      return !ctx->cf_info.has_branch;
     } else { /* non-uniform condition */
        /**
         * To maintain a logical and linear CFG without critical edges,
@@ -9868,9 +9727,9 @@ static bool visit_if(isel_context *ctx, nir_if *if_stmt)
        visit_cf_list(ctx, &if_stmt->else_list);
  
        end_divergent_if(ctx, &ic);
-
-      return true;
     }
+
+   return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty();
  }
  
  static bool visit_cf_list(isel_context *ctx,
@@ -9932,10 +9791,10 @@ static bool export_vs_varying(isel_context *ctx, int slot, bool is_pos, int *nex
        else
           exp->operands[i] = Operand(v1);
     }
-   /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
+   /* GFX10 (Navi1x) skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
      * Setting valid_mask=1 prevents it and has no other effect.
      */
-   exp->valid_mask = ctx->options->chip_class >= GFX10 && is_pos && *next_pos == 0;
+   exp->valid_mask = ctx->options->chip_class == GFX10 && is_pos && *next_pos == 0;
     exp->done = false;
     exp->compressed = false;
     if (is_pos)
@@ -9977,7 +9836,7 @@ static void export_vs_psiz_layer_viewport(isel_context *ctx, int *next_pos)
           exp->enabled_mask |= 0x4;
        }
     }
-   exp->valid_mask = ctx->options->chip_class >= GFX10 && *next_pos == 0;
+   exp->valid_mask = ctx->options->chip_class == GFX10 && *next_pos == 0;
     exp->done = false;
     exp->compressed = false;
     exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
@@ -10151,6 +10010,7 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot)
  
     bool is_int8 = (ctx->options->key.fs.is_int8 >> slot) & 1;
     bool is_int10 = (ctx->options->key.fs.is_int10 >> slot) & 1;
+   bool is_16bit = values[0].regClass() == v2b;
  
     switch (col_format)
     {
@@ -10181,16 +10041,38 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot)
     case V_028714_SPI_SHADER_FP16_ABGR:
        enabled_channels = 0x5;
        compr_op = aco_opcode::v_cvt_pkrtz_f16_f32;
+      if (is_16bit) {
+         if (ctx->options->chip_class >= GFX9) {
+            /* Pack the FP16 values together instead of converting them to
+             * FP32 and back to FP16.
+             * TODO: use p_create_vector and let the compiler optimizes.
+             */
+            compr_op = aco_opcode::v_pack_b32_f16;
+         } else {
+            for (unsigned i = 0; i < 4; i++) {
+               if ((write_mask >> i) & 1)
+                  values[i] = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), values[i]);
+            }
+         }
+      }
        break;
  
     case V_028714_SPI_SHADER_UNORM16_ABGR:
        enabled_channels = 0x5;
-      compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
+      if (is_16bit && ctx->options->chip_class >= GFX9) {
+         compr_op = aco_opcode::v_cvt_pknorm_u16_f16;
+      } else {
+         compr_op = aco_opcode::v_cvt_pknorm_u16_f32;
+      }
        break;
  
     case V_028714_SPI_SHADER_SNORM16_ABGR:
        enabled_channels = 0x5;
-      compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
+      if (is_16bit && ctx->options->chip_class >= GFX9) {
+         compr_op = aco_opcode::v_cvt_pknorm_i16_f16;
+      } else {
+         compr_op = aco_opcode::v_cvt_pknorm_i16_f32;
+      }
        break;
  
     case V_028714_SPI_SHADER_UINT16_ABGR: {
@@ -10208,6 +10090,13 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot)
                                      values[i]);
              }
           }
+      } else if (is_16bit) {
+         for (unsigned i = 0; i < 4; i++) {
+            if ((write_mask >> i) & 1) {
+               Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false);
+               values[i] = Operand(tmp);
+            }
+         }
        }
        break;
     }
@@ -10232,6 +10121,13 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot)
                                      values[i]);
              }
           }
+      } else if (is_16bit) {
+         for (unsigned i = 0; i < 4; i++) {
+            if ((write_mask >> i) & 1) {
+               Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true);
+               values[i] = Operand(tmp);
+            }
+         }
        }
        break;
  
@@ -10246,6 +10142,26 @@ static bool export_fs_mrt_color(isel_context *ctx, int slot)
     if (target == V_008DFC_SQ_EXP_NULL)
        return false;
  
+   /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */
+   if (ctx->options->enable_mrt_output_nan_fixup &&
+       !is_16bit &&
+       (col_format == V_028714_SPI_SHADER_32_R ||
+        col_format == V_028714_SPI_SHADER_32_GR ||
+        col_format == V_028714_SPI_SHADER_32_AR ||
+        col_format == V_028714_SPI_SHADER_32_ABGR ||
+        col_format == V_028714_SPI_SHADER_FP16_ABGR)) {
+      for (int i = 0; i < 4; i++) {
+         if (!(write_mask & (1 << i)))
+            continue;
+
+         Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32,
+                               bld.hint_vcc(bld.def(bld.lm)), values[i],
+                               bld.copy(bld.def(v1), Operand(3u)));
+         values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i],
+                              bld.copy(bld.def(v1), Operand(0u)), isnan);
+      }
+   }
+
     if ((bool) compr_op) {
        for (int i = 0; i < 2; i++) {
           /* check if at least one of the values to be compressed is enabled */
@@ -10290,6 +10206,13 @@ static void create_fs_exports(isel_context *ctx)
        create_null_export(ctx);
  }
  
+static void create_workgroup_barrier(Builder& bld)
+{
+   bld.barrier(aco_opcode::p_barrier,
+               memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup),
+               scope_workgroup);
+}
+
  static void write_tcs_tess_factors(isel_context *ctx)
  {
     unsigned outer_comps;
@@ -10314,9 +10237,7 @@ static void write_tcs_tess_factors(isel_context *ctx)
  
     Builder bld(ctx->program, ctx->block);
  
-   bld.barrier(aco_opcode::p_memory_barrier_shared);
-   if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
-      bld.sopp(aco_opcode::s_barrier);
+   create_workgroup_barrier(bld);
  
     Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
     Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u));
@@ -10366,8 +10287,8 @@ static void write_tcs_tess_factors(isel_context *ctx)
        Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
        bld.mubuf(aco_opcode::buffer_store_dword,
                  /* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
-                /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false,
-                /* disable_wqm */ false, /* glc */ true);
+                /* immediate OFFSET */ 0, /* OFFEN */ false, /* swizzled */ false, /* idxen*/ false,
+                /* addr64 */ false, /* disable_wqm */ false, /* glc */ true);
        tf_const_offset += 4;
  
        begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);
@@ -10377,7 +10298,7 @@ static void write_tcs_tess_factors(isel_context *ctx)
  
     assert(stride == 2 || stride == 4 || stride == 6);
     Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr, 4u);
-   store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false);
+   store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, memory_sync_info());
  
     /* Store to offchip for TES to read - only if TES reads them */
     if (ctx->args->options->key.tcs.tes_reads_tess_factors) {
@@ -10385,11 +10306,11 @@ static void write_tcs_tess_factors(isel_context *ctx)
        Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
  
        std::pair<Temp, unsigned> vmem_offs_outer = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_out_loc);
-      store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, false);
+      store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, memory_sync_info(storage_vmem_output));
  
        if (likely(inner_comps)) {
           std::pair<Temp, unsigned> vmem_offs_inner = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_in_loc);
-         store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, false);
+         store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, memory_sync_info(storage_vmem_output));
        }
     }
  
@@ -10475,7 +10396,6 @@ static void emit_stream_output(isel_context *ctx,
        store->glc = true;
        store->dlc = false;
        store->slc = true;
-      store->can_reorder = true;
        ctx->block->instructions.emplace_back(std::move(store));
     }
  }
@@ -10657,7 +10577,8 @@ void setup_fp_mode(isel_context *ctx, nir_shader *shader)
        float_controls & (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 |
                          FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64);
  
-   /* default to preserving fp16 and fp64 denorms, since it's free */
+   /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and
+    * the precision seems needed for Wolfenstein: Youngblood to render correctly */
     if (program->next_fp_mode.must_flush_denorms16_64)
        program->next_fp_mode.denorm16_64 = 0;
     else
@@ -10894,8 +10815,7 @@ void ngg_emit_nogs_output(isel_context *ctx)
  
        if (ctx->stage == ngg_vertex_gs) {
           /* Wait for GS threads to store primitive ID in LDS. */
-         bld.barrier(aco_opcode::p_memory_barrier_shared);
-         bld.sopp(aco_opcode::s_barrier);
+         create_workgroup_barrier(bld);
  
           /* Calculate LDS address where the GS threads stored the primitive ID. */
           Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
@@ -10979,8 +10899,7 @@ void select_program(Program *program,
        if (i) {
           Builder bld(ctx.program, ctx.block);
  
-         bld.barrier(aco_opcode::p_memory_barrier_shared);
-         bld.sopp(aco_opcode::s_barrier);
+         create_workgroup_barrier(bld);
  
           if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
              ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
@@ -11003,7 +10922,8 @@ void select_program(Program *program,
           ngg_emit_nogs_output(&ctx);
        } else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
           Builder bld(ctx.program, ctx.block);
-         bld.barrier(aco_opcode::p_memory_barrier_gs_data);
+         bld.barrier(aco_opcode::p_barrier,
+                     memory_sync_info(storage_vmem_output, semantic_release, scope_device));
           bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
        } else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
           write_tcs_tess_factors(&ctx);
@@ -11022,8 +10942,6 @@ void select_program(Program *program,
        if (ngg_no_gs && !ngg_early_prim_export(&ctx))
           ngg_emit_nogs_output(&ctx);
  
-      ralloc_free(ctx.divergent_vals);
-
        if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
           /* Outputs of the previous stage are inputs to the next stage */
           ctx.inputs = ctx.outputs;
@@ -11037,7 +10955,7 @@ void select_program(Program *program,
     ctx.block->kind |= block_kind_uniform;
     Builder bld(ctx.program, ctx.block);
     if (ctx.program->wb_smem_l1_on_end)
-      bld.smem(aco_opcode::s_dcache_wb, false);
+      bld.smem(aco_opcode::s_dcache_wb, memory_sync_info(storage_buffer, semantic_volatile));
     bld.sopp(aco_opcode::s_endpgm);
  
     cleanup_cfg(program);
@@ -11049,16 +10967,6 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
  {
     isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
  
-   program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
-   program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
-   program->next_fp_mode.must_flush_denorms32 = false;
-   program->next_fp_mode.must_flush_denorms16_64 = false;
-   program->next_fp_mode.care_about_round32 = false;
-   program->next_fp_mode.care_about_round16_64 = false;
-   program->next_fp_mode.denorm16_64 = fp_denorm_keep;
-   program->next_fp_mode.denorm32 = 0;
-   program->next_fp_mode.round32 = fp_round_ne;
-   program->next_fp_mode.round16_64 = fp_round_ne;
     ctx.block->fp_mode = program->next_fp_mode;
  
     add_startpgm(&ctx);
@@ -11132,8 +11040,6 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
              mubuf->glc = true;
              mubuf->slc = true;
              mubuf->dlc = args->options->chip_class >= GFX10;
-            mubuf->barrier = barrier_none;
-            mubuf->can_reorder = true;
  
              ctx.outputs.mask[i] |= 1 << j;
              ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();
@@ -11198,4 +11104,64 @@ void select_gs_copy_shader(Program *program, struct nir_shader *gs_shader,
  
     cleanup_cfg(program);
  }
+
+void select_trap_handler_shader(Program *program, struct nir_shader *shader,
+                                ac_shader_config* config,
+                                struct radv_shader_args *args)
+{
+   assert(args->options->chip_class == GFX8);
+
+   init_program(program, compute_cs, args->shader_info,
+                args->options->chip_class, args->options->family, config);
+
+   isel_context ctx = {};
+   ctx.program = program;
+   ctx.args = args;
+   ctx.options = args->options;
+   ctx.stage = program->stage;
+
+   ctx.block = ctx.program->create_and_insert_block();
+   ctx.block->loop_nest_depth = 0;
+   ctx.block->kind = block_kind_top_level;
+
+   program->workgroup_size = 1; /* XXX */
+
+   add_startpgm(&ctx);
+   append_logical_start(ctx.block);
+
+   Builder bld(ctx.program, ctx.block);
+
+   /* Load the buffer descriptor from TMA. */
+   bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4),
+            Operand(PhysReg{tma}, s2), Operand(0u));
+
+   /* Store TTMP0-TTMP1. */
+   bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4),
+            Operand(0u), Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);
+
+   uint32_t hw_regs_idx[] = {
+      2, /* HW_REG_STATUS */
+      3, /* HW_REG_TRAP_STS */
+      4, /* HW_REG_HW_ID */
+      7, /* HW_REG_IB_STS */
+   };
+
+   /* Store some hardware registers. */
+   for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
+      /* "((size - 1) << 11) | register" */
+      bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),
+               ((20 - 1) << 11) | hw_regs_idx[i]);
+
+      bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
+               Operand(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
+   }
+
+   program->config->float_mode = program->blocks[0].fp_mode.val;
+
+   append_logical_end(ctx.block);
+   ctx.block->kind |= block_kind_uniform;
+   bld.sopp(aco_opcode::s_endpgm);
+
+   cleanup_cfg(program);
+}
  }