namespace aco {
namespace {
+#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__)
+
+static void _isel_err(isel_context *ctx, const char *file, unsigned line,
+ const nir_instr *instr, const char *msg)
+{
+ char *out;
+ size_t outsize;
+ FILE *memf = open_memstream(&out, &outsize);
+
+ fprintf(memf, "%s: ", msg);
+ nir_print_instr(instr, memf);
+ fclose(memf);
+
+ _aco_err(ctx->program, file, line, out);
+ free(out);
+}
+
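For reference, a converted call site below is just the macro applied; a sketch of the expansion (nothing here beyond the two definitions above):

   isel_err(&instr->instr, "Unimplemented NIR instr bit size");
   /* expands to:
    *   _isel_err(ctx, __FILE__, __LINE__, &instr->instr,
    *             "Unimplemented NIR instr bit size");
    * and reports "<msg>: <printed NIR instruction>" through _aco_err, rather
    * than each call site hand-rolling fprintf(stderr, ...) as before. */
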
class loop_info_RAII {
isel_context* ctx;
unsigned header_idx_old;
if (ctx->program->wave_size == 32) {
return thread_id_lo;
+ } else if (ctx->program->chip_class <= GFX7) {
+ Temp thread_id_hi = bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
+ return thread_id_hi;
} else {
- Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, dst, mask_hi, thread_id_lo);
+ Temp thread_id_hi = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, dst, mask_hi, thread_id_lo);
return thread_id_hi;
}
}
}
}
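A descriptive trace of the wave64 path above; only the encoding of the second step differs by generation:

   /* tid_lo = v_mbcnt_lo_u32_b32(mask_lo, 0)       ; enabled lanes below this
    *                                               ; one, low 32 lanes
    * tid    = v_mbcnt_hi_u32_b32(mask_hi, tid_lo)  ; add the high 32 lanes
    * GFX6-7 emit the plain VOP2 form; later chips use the VOP3-encoded
    * _e64 variant, which is all the branch above selects between. */
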
+static Temp emit_masked_swizzle(isel_context *ctx, Builder &bld, Temp src, unsigned mask)
+{
+ if (ctx->options->chip_class >= GFX8) {
+ unsigned and_mask = mask & 0x1f;
+ unsigned or_mask = (mask >> 5) & 0x1f;
+ unsigned xor_mask = (mask >> 10) & 0x1f;
+
+ uint16_t dpp_ctrl = 0xffff;
+
+ // TODO: we could use DPP8 for some swizzles
+ if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) {
+ unsigned res[4] = {0, 1, 2, 3};
+ for (unsigned i = 0; i < 4; i++)
+ res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3;
+ dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]);
+ } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) {
+ dpp_ctrl = dpp_row_rr(8);
+ } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) {
+ dpp_ctrl = dpp_row_mirror;
+ } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) {
+ dpp_ctrl = dpp_row_half_mirror;
+ }
+
+ if (dpp_ctrl != 0xffff)
+ return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl);
+ }
+
+ return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false);
+}
+
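A worked instance of the DPP fast path, using the mask layout the code assumes (AND in bits [4:0], OR in [9:5], XOR in [14:10]):

   /* mask = 0x041f  ->  and_mask = 0x1f, or_mask = 0, xor_mask = 1
    * res[i] = ((i | 0) ^ 1) & 0x3  ->  {1, 0, 3, 2}
    * so this swizzle becomes v_mov_b32 with dpp_quad_perm(1, 0, 3, 2),
    * avoiding the ds_swizzle_b32 round-trip through LDS. */
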
Temp as_vgpr(isel_context *ctx, Temp val)
{
if (val.type() == RegType::sgpr) {
offset = Operand(0u);
}
- unsigned num_components = dst.bytes() / component_size;
+ unsigned num_components = vec.bytes() / component_size;
if (vec.regClass() == dst.regClass()) {
assert(offset.constantValue() == 0);
bld.copy(Definition(dst), vec);
return;
}
- emit_split_vector(ctx, vec, vec.bytes() / component_size);
+ emit_split_vector(ctx, vec, num_components);
std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems;
RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword();
assert(offset.constantValue() % component_size == 0);
unsigned skip = offset.constantValue() / component_size;
- for (unsigned i = 0; i < num_components; i++)
- elems[i] = emit_extract_vector(ctx, vec, i + skip, rc);
+ for (unsigned i = skip; i < num_components; i++)
+ elems[i - skip] = emit_extract_vector(ctx, vec, i, rc);
/* if dst is vgpr - split the src and create a shrunk version according to the mask. */
if (dst.type() == RegType::vgpr) {
+ num_components = dst.bytes() / component_size;
aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)};
for (unsigned i = 0; i < num_components; i++)
create_vec->operands[i] = Operand(elems[i]);
return emit_wqm(ctx, tmp, dst);
}
+Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp())
+{
+ if (!dst.id()) {
+ if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
+ dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
+ else
+ dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
+ }
+
+ if (dst.bytes() == src.bytes() && dst_bits < src_bits)
+ return bld.copy(Definition(dst), src);
+ else if (dst.bytes() < src.bytes())
+ return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
+
+ Temp tmp = dst;
+ if (dst_bits == 64)
+ tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
+
+ if (tmp == src) {
+ } else if (src.regClass() == s1) {
+ if (is_signed)
+ bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src);
+ else
+ bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src);
+ } else if (ctx->options->chip_class >= GFX8) {
+ assert(src_bits != 8 || src.regClass() == v1b);
+ assert(src_bits != 16 || src.regClass() == v2b);
+ aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
+ sdwa->operands[0] = Operand(src);
+ sdwa->definitions[0] = Definition(tmp);
+ if (is_signed)
+ sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
+ else
+ sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
+ sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
+ bld.insert(std::move(sdwa));
+ } else {
+ assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
+ aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
+ bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u));
+ }
+
+ if (dst_bits == 64) {
+ if (is_signed && dst.regClass() == s2) {
+ Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u));
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
+ } else if (is_signed && dst.regClass() == v2) {
+ Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp);
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
+ } else {
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
+ }
+ }
+
+ return dst;
+}
+
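One concrete case of the pre-GFX8 fallback in convert_int (v_bfe takes source, bit offset, bit width):

   /* sign-extending an 8-bit VGPR value on GFX6/GFX7:
    *   v_bfe_i32 tmp, src, 0, 8     ; extract bits [7:0], sign-extend to 32
    * unsigned conversions use v_bfe_u32, 16-bit sources pass width 16; for
    * dst_bits == 64 the high dword is then tmp >> 31 (arithmetic) or zero. */
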
+enum sgpr_extract_mode {
+ sgpr_extract_sext,
+ sgpr_extract_zext,
+ sgpr_extract_undef,
+};
+
+Temp extract_8_16_bit_sgpr_element(isel_context *ctx, Temp dst, nir_alu_src *src, sgpr_extract_mode mode)
+{
+ Temp vec = get_ssa_temp(ctx, src->src.ssa);
+ unsigned src_size = src->src.ssa->bit_size;
+ unsigned swizzle = src->swizzle[0];
+
+ if (vec.size() > 1) {
+ assert(src_size == 16);
+ vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
+ swizzle = swizzle & 1;
+ }
+
+ Builder bld(ctx->program, ctx->block);
+ unsigned offset = src_size * swizzle;
+ Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst;
+
+ if (mode == sgpr_extract_undef && swizzle == 0) {
+ bld.copy(Definition(tmp), vec);
+ } else if (mode == sgpr_extract_undef || (offset == 24 && mode == sgpr_extract_zext)) {
+ bld.sop2(aco_opcode::s_lshr_b32, Definition(tmp), bld.def(s1, scc), vec, Operand(offset));
+ } else if (src_size == 8 && swizzle == 0 && mode == sgpr_extract_sext) {
+ bld.sop1(aco_opcode::s_sext_i32_i8, Definition(tmp), vec);
+ } else if (src_size == 16 && swizzle == 0 && mode == sgpr_extract_sext) {
+ bld.sop1(aco_opcode::s_sext_i32_i16, Definition(tmp), vec);
+ } else {
+ aco_opcode op = mode == sgpr_extract_zext ? aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32;
+ bld.sop2(op, Definition(tmp), bld.def(s1, scc), vec, Operand((src_size << 16) | offset));
+ }
+
+ if (dst.regClass() == s2)
+ convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst);
+
+ return dst;
+}
+
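A worked case of the final s_bfe fallback, whose immediate packs the field width above the bit offset, exactly as (src_size << 16) | offset builds it:

   /* sign-extend the third byte of an SGPR: src_size = 8, swizzle = 2
    *   offset = 8 * 2 = 16
    *   s_bfe_i32 tmp, vec, 0x80010  ; (8 << 16) | 16
    * the offset == 24 zero-extend is handled by the cheaper s_lshr_b32
    * branch above instead. */
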
Temp get_alu_src(struct isel_context *ctx, nir_alu_src src, unsigned size=1)
{
if (src.src.ssa->num_components == 1 && src.swizzle[0] == 0 && size == 1)
if (elem_size < 4 && vec.type() == RegType::sgpr) {
assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16);
assert(size == 1);
- unsigned swizzle = src.swizzle[0];
- if (vec.size() > 1) {
- assert(src.src.ssa->bit_size == 16);
- vec = emit_extract_vector(ctx, vec, swizzle / 2, s1);
- swizzle = swizzle & 1;
- }
- if (swizzle == 0)
- return vec;
-
- Temp dst{ctx->program->allocateId(), s1};
- aco_ptr<SOP2_instruction> bfe{create_instruction<SOP2_instruction>(aco_opcode::s_bfe_u32, Format::SOP2, 2, 2)};
- bfe->operands[0] = Operand(vec);
- bfe->operands[1] = Operand(uint32_t((src.src.ssa->bit_size << 16) | (src.src.ssa->bit_size * swizzle)));
- bfe->definitions[0] = Definition(dst);
- bfe->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
- ctx->block->instructions.emplace_back(std::move(bfe));
- return dst;
+ return extract_8_16_bit_sgpr_element(
+ ctx, Temp(ctx->program->allocateId(), s1), &src, sgpr_extract_undef);
}
RegClass elem_rc = elem_size < 4 ? RegClass(vec.type(), elem_size).as_subdword() : RegClass(vec.type(), elem_size / 4);
sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0]));
sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1]));
sop2->definitions[0] = Definition(dst);
+ if (instr->no_unsigned_wrap)
+ sop2->definitions[0].setNUW(true);
if (writes_scc)
sop2->definitions[1] = Definition(ctx->program->allocateId(), scc, s1);
ctx->block->instructions.emplace_back(std::move(sop2));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
return;
}
aco_opcode op = dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64;
bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond)));
} else {
- fprintf(stderr, "Unimplemented uniform bcsel bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented uniform bcsel bit size");
}
return;
}
if (ctx->options->chip_class >= GFX7)
return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val);
- /* GFX6 doesn't support V_FLOOR_F64, lower it. */
+ /* GFX6 doesn't support V_FLOOR_F64, lower it here (note that in practice
+ * it is already lowered at NIR level for precision reasons). */
Temp src0 = as_vgpr(ctx, val);
Temp mask = bld.copy(bld.def(s1), Operand(3u)); /* isnan */
return add->definitions[0].getTemp();
}
-Temp convert_int(isel_context *ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, bool is_signed, Temp dst=Temp()) {
- if (!dst.id()) {
- if (dst_bits % 32 == 0 || src.type() == RegType::sgpr)
- dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u));
- else
- dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword());
- }
-
- if (dst.bytes() == src.bytes() && dst_bits < src_bits)
- return bld.copy(Definition(dst), src);
- else if (dst.bytes() < src.bytes())
- return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand(0u));
-
- Temp tmp = dst;
- if (dst_bits == 64)
- tmp = src_bits == 32 ? src : bld.tmp(src.type(), 1);
-
- if (tmp == src) {
- } else if (src.regClass() == s1) {
- if (is_signed)
- bld.sop1(src_bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16, Definition(tmp), src);
- else
- bld.sop2(aco_opcode::s_and_b32, Definition(tmp), bld.def(s1, scc), Operand(src_bits == 8 ? 0xFFu : 0xFFFFu), src);
- } else if (ctx->options->chip_class >= GFX8) {
- assert(src_bits != 8 || src.regClass() == v1b);
- assert(src_bits != 16 || src.regClass() == v2b);
- aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
- sdwa->operands[0] = Operand(src);
- sdwa->definitions[0] = Definition(tmp);
- if (is_signed)
- sdwa->sel[0] = src_bits == 8 ? sdwa_sbyte : sdwa_sword;
- else
- sdwa->sel[0] = src_bits == 8 ? sdwa_ubyte : sdwa_uword;
- sdwa->dst_sel = tmp.bytes() == 2 ? sdwa_uword : sdwa_udword;
- bld.insert(std::move(sdwa));
- } else {
- assert(ctx->options->chip_class == GFX6 || ctx->options->chip_class == GFX7);
- aco_opcode opcode = is_signed ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32;
- bld.vop3(opcode, Definition(tmp), src, Operand(0u), Operand(src_bits == 8 ? 8u : 16u));
- }
-
- if (dst_bits == 64) {
- if (is_signed && dst.regClass() == s2) {
- Temp high = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand(31u));
- bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
- } else if (is_signed && dst.regClass() == v2) {
- Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand(31u), tmp);
- bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high);
- } else {
- bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
- }
- }
-
- return dst;
-}
-
void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
{
if (!instr->dest.dest.is_ssa) {
- fprintf(stderr, "nir alu dst not in ssa: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "nir alu dst not in ssa");
abort();
}
Builder bld(ctx->program, ctx->block);
aco_opcode opcode = dst.size() == 1 ? aco_opcode::s_not_b32 : aco_opcode::s_not_b64;
bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
}
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
Temp src = get_alu_src(ctx, instr->src[0]);
bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, bld.vsub32(bld.def(v1), Operand(0u), src));
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), neg, gtz);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == s2) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == s2) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == s2) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == s2) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == s2) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (src.regClass() == s2) {
bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
Temp carry = bld.vsub32(Definition(msb), Operand(31u), Operand(msb_rev), true).def(1).getTemp();
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand((uint32_t)-1), carry);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == v1) {
bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0]));
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), tmp, Operand((uint32_t) -1), carry);
}
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), carry);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand(0u));
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand(1u), borrow);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand(0u));
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == s1) {
emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
as_vgpr(ctx, get_alu_src(ctx, instr->src[1])));
bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == v2) {
bld.vop3(aco_opcode::v_mul_f64, Definition(dst), src0, src1);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == v2) {
bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, src1);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
VOP3A_instruction* sub = static_cast<VOP3A_instruction*>(add);
sub->neg[1] = true;
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
bld.vop3(aco_opcode::v_max_f64, Definition(dst), src0, src1);
}
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
bld.vop3(aco_opcode::v_min_f64, Definition(dst), src0, src1);
}
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == v1) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == v1) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == v1) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_f32, dst, ctx->block->fp_mode.must_flush_denorms32);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
if (dst.size() == 1) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_u32, dst);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
if (dst.size() == 1) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_u32, dst);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
if (dst.size() == 1) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_u32, dst);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
if (dst.size() == 1) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_max3_i32, dst);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
if (dst.size() == 1) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_min3_i32, dst);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
if (dst.size() == 1) {
emit_vop3a_instruction(ctx, instr, aco_opcode::v_med3_i32, dst);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma);
Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]);
Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]);
- sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, ma, Operand(0x3f000000u/*0.5*/));
- tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, ma, Operand(0x3f000000u/*0.5*/));
+ sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1),
+ bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma), Operand(0x3f000000u/*0.5*/));
+ tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1),
+ bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma), Operand(0x3f000000u/*0.5*/));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), sc, tc);
break;
}
/* Lowered at NIR level for precision reasons. */
emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
case nir_op_fneg: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
+ if (ctx->block->fp_mode.must_flush_denorms16_64)
+ src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00/*1.0*/), as_vgpr(ctx, src));
bld.vop2(aco_opcode::v_xor_b32, Definition(dst), Operand(0x8000u), as_vgpr(ctx, src));
} else if (dst.regClass() == v1) {
if (ctx->block->fp_mode.must_flush_denorms32)
upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand(0x80000000u), upper);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
case nir_op_fabs: {
Temp src = get_alu_src(ctx, instr->src[0]);
if (dst.regClass() == v2b) {
+ if (ctx->block->fp_mode.must_flush_denorms16_64)
+ src = bld.vop2(aco_opcode::v_mul_f16, bld.def(v2b), Operand((uint16_t)0x3C00/*1.0*/), as_vgpr(ctx, src));
bld.vop2(aco_opcode::v_and_b32, Definition(dst), Operand(0x7FFFu), as_vgpr(ctx, src));
} else if (dst.regClass() == v1) {
if (ctx->block->fp_mode.must_flush_denorms32)
upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(0x7FFFFFFFu), upper);
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(add);
vop3->clamp = true;
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == v1) {
emit_log2(ctx, bld, Definition(dst), src);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
/* Lowered at NIR level for precision reasons. */
emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == v1) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
/* Lowered at NIR level for precision reasons. */
emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == v2) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == v2) {
emit_floor_f64(ctx, bld, Definition(dst), src);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add);
}
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == v2) {
emit_trunc_f64(ctx, bld, Definition(dst), src);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1);
}
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
aco_opcode opcode = instr->op == nir_op_fsin ? aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32;
bld.vop1(opcode, Definition(dst), tmp);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == v2) {
bld.vop3(aco_opcode::v_ldexp_f64, Definition(dst), as_vgpr(ctx, src0), src1);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == v2) {
bld.vop1(aco_opcode::v_frexp_mant_f64, Definition(dst), src);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (instr->src[0].src.ssa->bit_size == 64) {
bld.vop1(aco_opcode::v_frexp_exp_i32_f64, Definition(dst), src);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(0u), upper);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
Temp src = get_alu_src(ctx, instr->src[0]);
if (instr->src[0].src.ssa->bit_size == 64)
src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src);
- bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
+ if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne)
+ /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to
+ * keep value numbering and the scheduler simpler.
+ */
+ bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src);
+ else
+ bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src);
break;
}
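Per the comment above, the pseudo is expanded late; a rough sketch of the intended lowering, inferred from the comment rather than from the lower_to_hw_instr hunk (not shown here):

   /* save the current round mode, then:
    *   s_round_mode / s_setreg_imm32_b32  ; force round-to-nearest-even
    *   v_cvt_f16_f32 dst, src
    *   restore the previous round mode
    * keeping this as one pseudo until lowering is what spares value
    * numbering and the scheduler from seeing mode-changing instructions. */
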
case nir_op_f2f16_rtz: {
} else if (instr->src[0].src.ssa->bit_size == 64) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand(32u));
bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (instr->src[0].src.ssa->bit_size == 64) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (instr->src[0].src.ssa->bit_size == 64) {
emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
case nir_op_i2i16:
case nir_op_i2i32:
case nir_op_i2i64: {
- convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
- instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, true, dst);
+ if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
+ /* no need to do the extract in get_alu_src() */
+ sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size ?
+ sgpr_extract_sext : sgpr_extract_undef;
+ extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
+ } else {
+ convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
+ instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, true, dst);
+ }
break;
}
case nir_op_u2u8:
case nir_op_u2u16:
case nir_op_u2u32:
case nir_op_u2u64: {
- convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
- instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst);
+ if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) {
+ /* no need to do the extract in get_alu_src() */
+ sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size ?
+ sgpr_extract_zext : sgpr_extract_undef;
+ extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode);
+ } else {
+ convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]),
+ instr->src[0].src.ssa->bit_size, instr->dest.dest.ssa.bit_size, false, dst);
+ }
break;
}
case nir_op_b2b32:
- case nir_op_b2i32: {
+ case nir_op_b2i8:
+ case nir_op_b2i16:
+ case nir_op_b2i32:
+ case nir_op_b2i64: {
Temp src = get_alu_src(ctx, instr->src[0]);
assert(src.regClass() == bld.lm);
- if (dst.regClass() == s1) {
+ Temp tmp = dst.bytes() == 8 ? bld.tmp(RegClass::get(dst.type(), 4)) : dst;
+ if (tmp.regClass() == s1) {
// TODO: in a post-RA optimization, we can check if src is in VCC, and directly use VCCNZ
- bool_to_scalar_condition(ctx, src, dst);
- } else if (dst.regClass() == v1) {
- bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand(0u), Operand(1u), src);
+ bool_to_scalar_condition(ctx, src, tmp);
+ } else if (tmp.type() == RegType::vgpr) {
+ bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand(0u), Operand(1u), src);
} else {
unreachable("Invalid register class for b2i32");
}
+
+ if (tmp != dst)
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand(0u));
break;
}
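The widened case funnels every b2i variant through one 32-bit select; a trace of the 64-bit VGPR path, with names as in the code above:

   /* tmp = v_cndmask_b32(0, 1, src)   ; bool -> 0/1 in 32 bits
    * dst = p_create_vector(tmp, 0)    ; zero-extend to 64 bits
    * for b2i8/b2i16/b2i32, tmp aliases dst and the final step is skipped. */
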
case nir_op_b2b1:
bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src0),
bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src1));
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
if (dst.regClass() == v1) {
bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), get_alu_src(ctx, instr->src[0]));
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst),
bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), Operand(16u), as_vgpr(ctx, get_alu_src(ctx, instr->src[0]))));
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (dst.regClass() == v1) {
bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
bld.vop3(aco_opcode::v_bfi_b32, Definition(dst), bitmask, insert, base);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
} else if (src.regClass() == s2) {
bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
break;
}
default:
- fprintf(stderr, "Unknown NIR ALU instr: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unknown NIR ALU instr");
}
}
bool glc = false;
unsigned swizzle_component_size = 0;
- barrier_interaction barrier = barrier_none;
- bool can_reorder = true;
+ memory_sync_info sync;
Temp soffset = Temp(0, s1);
};
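The dropped barrier/can_reorder pair is folded into the single sync field; a trimmed sketch of the type this series relies on (assumed from aco_ir.h in the same rework, not part of this hunk):

   struct memory_sync_info {
      memory_sync_info() : storage(storage_none), semantics(semantic_none),
                           scope(scope_invocation) {}
      memory_sync_info(int storage, int semantics = 0,
                       sync_scope scope = scope_invocation)
         : storage((storage_class)storage),
           semantics((memory_semantics)semantics), scope(scope) {}

      storage_class storage:8;      /* shared, buffer, image, vmem_output, ... */
      memory_semantics semantics:8; /* volatile, can_reorder, atomicrmw, ... */
      sync_scope scope:8;
   };
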
/* align offset down if needed */
Operand aligned_offset = offset;
+ unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
if (need_to_align_offset) {
+ align = 4;
Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp();
if (offset.isConstant()) {
aligned_offset = Operand(offset.constantValue() & 0xfffffffcu);
Temp aligned_offset_tmp = aligned_offset.isTemp() ? aligned_offset.getTemp() :
bld.copy(bld.def(s1), aligned_offset);
- unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul;
Temp val = callback(bld, info, aligned_offset_tmp, bytes_needed, align,
reduced_const_offset, byte_align ? Temp() : info->dst);
if (num_tmps > 1) {
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)};
- for (unsigned i = 0; i < num_vals; i++)
+ for (unsigned i = 0; i < num_tmps; i++)
vec->operands[i] = Operand(tmp[i]);
tmp[0] = bld.tmp(RegClass::get(reg_type, tmp_size));
vec->definitions[0] = Definition(tmp[0]);
RegClass rc = RegClass(RegType::vgpr, DIV_ROUND_UP(size, 4));
Temp val = rc == info->dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc);
+ Instruction *instr;
if (read2)
- bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
+ instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1);
else
- bld.ds(op, Definition(val), offset, m, const_offset);
+ instr = bld.ds(op, Definition(val), offset, m, const_offset);
+ static_cast<DS_instruction *>(instr)->sync = info->sync;
if (size < 4)
val = bld.pseudo(aco_opcode::p_extract_vector, bld.def(RegClass::get(RegType::vgpr, size)), val, Operand(0u));
load->definitions[0] = Definition(val);
load->glc = info->glc;
load->dlc = info->glc && bld.program->chip_class >= GFX10;
- load->barrier = info->barrier;
- load->can_reorder = false; // FIXME: currently, it doesn't seem beneficial due to how our scheduler works
+ load->sync = info->sync;
bld.insert(std::move(load));
return val;
}
unsigned bytes_size = 0;
aco_opcode op;
- if (bytes_needed == 1) {
+ if (bytes_needed == 1 || align_ % 2) {
bytes_size = 1;
op = aco_opcode::buffer_load_ubyte;
- } else if (bytes_needed == 2) {
+ } else if (bytes_needed == 2 || align_ % 4) {
bytes_size = 2;
op = aco_opcode::buffer_load_ushort;
} else if (bytes_needed <= 4) {
mubuf->offen = (offset.type() == RegType::vgpr);
mubuf->glc = info->glc;
mubuf->dlc = info->glc && bld.program->chip_class >= GFX10;
- mubuf->barrier = info->barrier;
- mubuf->can_reorder = info->can_reorder;
+ mubuf->sync = info->sync;
mubuf->offset = const_offset;
- RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4));
+ mubuf->swizzled = info->swizzle_component_size != 0;
+ RegClass rc = RegClass::get(RegType::vgpr, bytes_size);
Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc);
mubuf->definitions[0] = Definition(val);
bld.insert(std::move(mubuf));
}
static auto emit_mubuf_load = emit_load<mubuf_load_callback, true, true, 4096>;
+static auto emit_scratch_load = emit_load<mubuf_load_callback, false, true, 4096>;
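A quick trace of the new alignment guard in mubuf_load_callback: the requested size no longer picks the opcode alone, the running alignment does too, and the byte-align handling in the generic emit_load path reassembles the pieces:

   /* bytes_needed = 4:
    *   align_ % 2 == 1  ->  buffer_load_ubyte,  bytes_size = 1
    *   align_ % 4 == 2  ->  buffer_load_ushort, bytes_size = 2
    *   align_ % 4 == 0  ->  dword-sized load as before */
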
Temp get_gfx6_global_rsrc(Builder& bld, Temp addr)
{
mubuf->offset = 0;
mubuf->addr64 = offset.type() == RegType::vgpr;
mubuf->disable_wqm = false;
- mubuf->barrier = info->barrier;
+ mubuf->sync = info->sync;
mubuf->definitions[0] = Definition(val);
bld.insert(std::move(mubuf));
} else {
flat->operands[1] = Operand(s1);
flat->glc = info->glc;
flat->dlc = info->glc && bld.program->chip_class >= GFX10;
- flat->barrier = info->barrier;
+ flat->sync = info->sync;
flat->offset = 0u;
flat->definitions[0] = Definition(val);
bld.insert(std::move(flat));
LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes};
info.align_mul = align;
info.align_offset = 0;
- info.barrier = barrier_shared;
- info.can_reorder = false;
+ info.sync = memory_sync_info(storage_shared);
info.const_offset = base_offset;
emit_lds_load(ctx, bld, &info);
/* use allocated_vec if possible */
auto it = ctx->allocated_vec.find(src.id());
if (it != ctx->allocated_vec.end()) {
- unsigned total_size = 0;
- for (unsigned i = 0; it->second[i].bytes() && (i < NIR_MAX_VEC_COMPONENTS); i++)
- total_size += it->second[i].bytes();
- if (total_size != src.bytes())
+ if (!it->second[0].id())
goto split;
-
unsigned elem_size = it->second[0].bytes();
+ assert(src.bytes() % elem_size == 0);
+
+ for (unsigned i = 0; i < src.bytes() / elem_size; i++) {
+ if (!it->second[i].id())
+ goto split;
+ }
for (unsigned i = 0; i < count; i++) {
if (offsets[i] % elem_size || dst[i].bytes() % elem_size)
}
}
+ split:
+
if (dst_type == RegType::sgpr)
src = bld.as_uniform(src);
- split:
/* just split it */
aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>(aco_opcode::p_split_vector, Format::PSEUDO, 1, count)};
split->operands[0] = Operand(src);
}
assert(inline_offset <= max_offset); /* offsets[i] shouldn't be large enough for this to happen */
+ Instruction *instr;
if (write2) {
Temp second_data = write_datas[second];
inline_offset /= data.bytes();
- bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
+ instr = bld.ds(op, address_offset, data, second_data, m, inline_offset, inline_offset + write2_off);
} else {
- bld.ds(op, address_offset, data, m, inline_offset);
+ instr = bld.ds(op, address_offset, data, m, inline_offset);
}
+ static_cast<DS_instruction *>(instr)->sync =
+ memory_sync_info(storage_shared);
}
}
}
void emit_single_mubuf_store(isel_context *ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata,
- unsigned const_offset = 0u, bool allow_reorder = true, bool slc = false)
+ unsigned const_offset = 0u, memory_sync_info sync=memory_sync_info(),
+ bool slc = false, bool swizzled = false)
{
assert(vdata.id());
assert(vdata.size() != 3 || ctx->program->chip_class != GFX6);
Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1);
Operand soffset_op = soffset.id() ? Operand(soffset) : Operand(0u);
Builder::Result r = bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset,
- /* offen */ !voffset_op.isUndefined(), /* idxen*/ false, /* addr64 */ false,
- /* disable_wqm */ false, /* glc */ true, /* dlc*/ false, /* slc */ slc);
+ /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled,
+ /* idxen */ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true,
+ /* dlc */ false, /* slc */ slc);
- static_cast<MUBUF_instruction *>(r.instr)->can_reorder = allow_reorder;
+ static_cast<MUBUF_instruction *>(r.instr)->sync = sync;
}
void store_vmem_mubuf(isel_context *ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset,
unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask,
- bool allow_combining = true, bool reorder = true, bool slc = false)
+ bool allow_combining = true, memory_sync_info sync=memory_sync_info(), bool slc = false)
{
Builder bld(ctx->program, ctx->block);
assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8);
for (unsigned i = 0; i < write_count; i++) {
unsigned const_offset = offsets[i] + base_const_offset;
- emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, reorder, slc);
+ emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync, slc, !allow_combining);
}
}
/* GFX6-8: ES stage is not merged into GS, data is passed from ES to GS in VMEM. */
Temp esgs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_ESGS_VS * 16u));
Temp es2gs_offset = get_arg(ctx, ctx->args->es2gs_offset);
- store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, true, true);
+ store_vmem_mubuf(ctx, src, esgs_ring, offs.first, es2gs_offset, offs.second, elem_size_bytes, write_mask, false, memory_sync_info(), true);
} else {
Temp lds_base;
Temp hs_ring_tess_offchip = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, Operand(RING_HS_TESS_OFFCHIP * 16u));
Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
- store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, false);
+ store_vmem_mubuf(ctx, store_val, hs_ring_tess_offchip, vmem_offs.first, oc_lds, vmem_offs.second, elem_size_bytes, write_mask, true, memory_sync_info(storage_vmem_output));
}
if (write_to_lds) {
ctx->shader->info.stage == MESA_SHADER_GEOMETRY) {
bool stored_to_temps = store_output_to_temps(ctx, instr);
if (!stored_to_temps) {
- fprintf(stderr, "Unimplemented output offset instruction:\n");
- nir_print_instr(instr->src[1].ssa->parent_instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction");
abort();
}
} else if (ctx->stage == vertex_es ||
nir_instr *off_instr = instr->src[0].ssa->parent_instr;
if (off_instr->type != nir_instr_type_load_const) {
- fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
- nir_print_instr(off_instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(off_instr, "Unimplemented nir_intrinsic_load_input offset");
}
uint32_t offset = nir_instr_as_load_const(off_instr)->value[0].u32;
}
if (use_mubuf) {
- Instruction *mubuf = bld.mubuf(opcode,
- Definition(fetch_dst), list, fetch_index, soffset,
- fetch_offset, false, true).instr;
- static_cast<MUBUF_instruction*>(mubuf)->can_reorder = true;
+ bld.mubuf(opcode,
+ Definition(fetch_dst), list, fetch_index, soffset,
+ fetch_offset, false, false, true);
} else {
- Instruction *mtbuf = bld.mtbuf(opcode,
- Definition(fetch_dst), list, fetch_index, soffset,
- fetch_dfmt, nfmt, fetch_offset, false, true).instr;
- static_cast<MTBUF_instruction*>(mtbuf)->can_reorder = true;
+ bld.mtbuf(opcode,
+ Definition(fetch_dst), list, fetch_index, soffset,
+ fetch_dfmt, nfmt, fetch_offset, false, true);
}
emit_split_vector(ctx, fetch_dst, fetch_dst.size());
nir_instr *off_instr = instr->src[offset_idx].ssa->parent_instr;
if (off_instr->type != nir_instr_type_load_const ||
nir_instr_as_load_const(off_instr)->value[0].u32 != 0) {
- fprintf(stderr, "Unimplemented nir_intrinsic_load_input offset\n");
- nir_print_instr(off_instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(off_instr, "Unimplemented nir_intrinsic_load_input offset");
}
Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);
void load_buffer(isel_context *ctx, unsigned num_components, unsigned component_size,
Temp dst, Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset,
- bool glc=false, bool readonly=true, bool allow_smem=true)
+ bool glc=false, bool allow_smem=true, memory_sync_info sync=memory_sync_info())
{
Builder bld(ctx->program, ctx->block);
LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc};
info.glc = glc;
- info.barrier = readonly ? barrier_none : barrier_buffer;
- info.can_reorder = readonly;
+ info.sync = sync;
info.align_mul = align_mul;
info.align_offset = align_offset;
if (use_smem)
Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
if (offset != 0) // TODO check if index != 0 as well
- index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
+ index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand(offset), index);
Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants));
Temp vec = dst;
bool trim = false;
unreachable("unimplemented or forbidden load_push_constant.");
}
- bld.smem(op, Definition(vec), ptr, index);
+ static_cast<SMEM_instruction*>(bld.smem(op, Definition(vec), ptr, index).instr)->prevent_overflow = true;
if (!aligned) {
Operand byte_offset = index_cv ? Operand((offset + index_cv->u32) % 4) : Operand(index);
Temp offset = get_ssa_temp(ctx, instr->src[0].ssa);
if (base && offset.type() == RegType::sgpr)
- offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
+ offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, Operand(base));
else if (base && offset.type() == RegType::vgpr)
offset = bld.vadd32(bld.def(v1), Operand(base), offset);
load->unrm = true;
load->da = da;
load->dim = dim;
- load->can_reorder = true; /* fmask images shouldn't be modified */
ctx->block->instructions.emplace_back(std::move(load));
Operand sample_index4;
}
+memory_sync_info get_memory_sync_info(nir_intrinsic_instr *instr, storage_class storage, unsigned semantics)
+{
+ /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */
+ if (semantics & semantic_atomicrmw)
+ return memory_sync_info(storage, semantics);
+
+ unsigned access = nir_intrinsic_access(instr);
+
+ if (access & ACCESS_VOLATILE)
+ semantics |= semantic_volatile;
+ if (access & ACCESS_CAN_REORDER)
+ semantics |= semantic_can_reorder | semantic_private;
+
+ return memory_sync_info(storage, semantics);
+}
+
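For example, an SSBO load whose access flags carry ACCESS_VOLATILE | ACCESS_CAN_REORDER resolves as:

   /* get_memory_sync_info(instr, storage_buffer, 0)
    *   -> memory_sync_info(storage_buffer,
    *                       semantic_volatile | semantic_can_reorder |
    *                       semantic_private)
    * atomics pass semantic_atomicrmw instead and return before the access
    * flags are inspected at all. */
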
void visit_image_load(isel_context *ctx, nir_intrinsic_instr *instr)
{
Builder bld(ctx->program, ctx->block);
bool is_array = glsl_sampler_type_is_array(type);
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
+ unsigned access = var->data.access | nir_intrinsic_access(instr);
+
if (dim == GLSL_SAMPLER_DIM_BUF) {
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
unsigned num_channels = util_last_bit(mask);
tmp = {ctx->program->allocateId(), RegClass(RegType::vgpr, num_channels)};
load->definitions[0] = Definition(tmp);
load->idxen = true;
- load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT);
+ load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT);
load->dlc = load->glc && ctx->options->chip_class >= GFX10;
- load->barrier = barrier_image;
+ load->sync = sync;
ctx->block->instructions.emplace_back(std::move(load));
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, (1 << num_channels) - 1);
load->operands[1] = Operand(s4); /* no sampler */
load->operands[2] = Operand(coords);
load->definitions[0] = Definition(tmp);
- load->glc = var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
+ load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0;
load->dlc = load->glc && ctx->options->chip_class >= GFX10;
load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
load->dmask = dmask;
load->unrm = true;
load->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
- load->barrier = barrier_image;
+ load->sync = sync;
ctx->block->instructions.emplace_back(std::move(load));
expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, dmask);
bool is_array = glsl_sampler_type_is_array(type);
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa));
- bool glc = ctx->options->chip_class == GFX6 || var->data.access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
+ memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0);
+ unsigned access = var->data.access | nir_intrinsic_access(instr);
+ bool glc = ctx->options->chip_class == GFX6 || access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) ? 1 : 0;
if (dim == GLSL_SAMPLER_DIM_BUF) {
Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), ACO_DESC_BUFFER, nullptr, true, true);
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
- store->barrier = barrier_image;
+ store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
return;
store->unrm = true;
store->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
store->disable_wqm = true;
- store->barrier = barrier_image;
+ store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
return;
}
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw);
if (dim == GLSL_SAMPLER_DIM_BUF) {
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
mubuf->glc = return_previous;
mubuf->dlc = false; /* Not needed for atomics */
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_image;
+ mubuf->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
return;
mimg->unrm = true;
mimg->da = should_declare_array(ctx, dim, glsl_sampler_type_is_array(type));
mimg->disable_wqm = true;
- mimg->barrier = barrier_image;
+ mimg->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mimg));
return;
mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array);
mimg->dmask = (1 << instr->dest.ssa.num_components) - 1;
mimg->da = glsl_sampler_type_is_array(type);
- mimg->can_reorder = true;
Definition& def = mimg->definitions[0];
ctx->block->instructions.emplace_back(std::move(mimg));
allow_smem |= ((access & ACCESS_RESTRICT) && (access & ACCESS_NON_WRITEABLE)) || (access & ACCESS_CAN_REORDER);
load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa),
- nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, false, allow_smem);
+ nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem,
+ get_memory_sync_info(instr, storage_buffer, 0));
}
void visit_store_ssbo(isel_context *ctx, nir_intrinsic_instr *instr)
Temp rsrc = convert_pointer_to_64_bit(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
rsrc = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), rsrc, Operand(0u));
+ memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
uint32_t flags = get_all_buffer_resource_flags(ctx, instr->src[1].ssa, nir_intrinsic_access(instr));
/* GLC bypasses VMEM/SMEM caches, so GLC SMEM loads/stores are coherent with GLC VMEM loads/stores
bool smem = !nir_src_is_divergent(instr->src[2]) &&
ctx->options->chip_class >= GFX8 &&
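+ /* scalar memory stores were removed on GFX10.3 (assumption), hence the upper bound below */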
+ ctx->options->chip_class < GFX10_3 &&
(elem_size_bytes >= 4 || can_subdword_ssbo_store_use_smem(instr)) &&
allow_smem;
if (smem)
aco_ptr<SMEM_instruction> store{create_instruction<SMEM_instruction>(op, Format::SMEM, 3, 0)};
store->operands[0] = Operand(rsrc);
if (offsets[i]) {
- Temp off = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
- offset, Operand(offsets[i]));
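+ /* nuw: the added constant offset never wraps, so later passes may fold it freely */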
+ Temp off = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc),
+ offset, Operand(offsets[i]));
store->operands[1] = Operand(off);
} else {
store->operands[1] = Operand(offset);
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
- store->barrier = barrier_buffer;
+ store->sync = sync;
ctx->block->instructions.emplace_back(std::move(store));
ctx->program->wb_smem_l1_on_end = true;
if (op == aco_opcode::p_fs_buffer_store_smem) {
store->glc = glc;
store->dlc = false;
store->disable_wqm = true;
- store->barrier = barrier_buffer;
+ store->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(store));
}
mubuf->glc = return_previous;
mubuf->dlc = false; /* Not needed for atomics */
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_buffer;
+ mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
}
info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT);
info.align_mul = nir_intrinsic_align_mul(instr);
info.align_offset = nir_intrinsic_align_offset(instr);
- info.barrier = barrier_buffer;
- info.can_reorder = false;
+ info.sync = get_memory_sync_info(instr, storage_buffer, 0);
/* VMEM stores don't update the SMEM cache and it's difficult to prove that
* it's safe to use SMEM */
bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE;
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
Temp addr = get_ssa_temp(ctx, instr->src[1].ssa);
+ memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0);
bool glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE);
if (ctx->options->chip_class >= GFX7)
flat->dlc = false;
flat->offset = offset;
flat->disable_wqm = true;
- flat->barrier = barrier_buffer;
+ flat->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(flat));
} else {
mubuf->offset = offsets[i];
mubuf->addr64 = addr.type() == RegType::vgpr;
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_buffer;
+ mubuf->sync = sync;
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
}
flat->dlc = false; /* Not needed for atomics */
flat->offset = 0;
flat->disable_wqm = true;
- flat->barrier = barrier_buffer;
+ flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(flat));
} else {
mubuf->offset = 0;
mubuf->addr64 = addr.type() == RegType::vgpr;
mubuf->disable_wqm = true;
- mubuf->barrier = barrier_buffer;
+ mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw);
ctx->program->needs_exact = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
}
}
-void emit_memory_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
- Builder bld(ctx->program, ctx->block);
- switch(instr->intrinsic) {
- case nir_intrinsic_group_memory_barrier:
- case nir_intrinsic_memory_barrier:
- bld.barrier(aco_opcode::p_memory_barrier_common);
- break;
- case nir_intrinsic_memory_barrier_buffer:
- bld.barrier(aco_opcode::p_memory_barrier_buffer);
- break;
- case nir_intrinsic_memory_barrier_image:
- bld.barrier(aco_opcode::p_memory_barrier_image);
- break;
- case nir_intrinsic_memory_barrier_tcs_patch:
- case nir_intrinsic_memory_barrier_shared:
- bld.barrier(aco_opcode::p_memory_barrier_shared);
- break;
- default:
- unreachable("Unimplemented memory barrier intrinsic");
- break;
+sync_scope translate_nir_scope(nir_scope scope)
+{
+ switch (scope) {
+ case NIR_SCOPE_NONE:
+ case NIR_SCOPE_INVOCATION:
+ return scope_invocation;
+ case NIR_SCOPE_SUBGROUP:
+ return scope_subgroup;
+ case NIR_SCOPE_WORKGROUP:
+ return scope_workgroup;
+ case NIR_SCOPE_QUEUE_FAMILY:
+ return scope_queuefamily;
+ case NIR_SCOPE_DEVICE:
+ return scope_device;
}
+ unreachable("invalid scope");
+}
+
+void emit_scoped_barrier(isel_context *ctx, nir_intrinsic_instr *instr) {
+ Builder bld(ctx->program, ctx->block);
+
+ unsigned semantics = 0;
+ unsigned storage = 0;
+ sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr));
+ sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr));
+
+ unsigned nir_storage = nir_intrinsic_memory_modes(instr);
+ if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global))
+ storage |= storage_buffer | storage_image; //TODO: split this when NIR gets nir_var_mem_image
+ if (ctx->shader->info.stage == MESA_SHADER_COMPUTE && (nir_storage & nir_var_mem_shared))
+ storage |= storage_shared;
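+ /* TCS outputs are kept (at least partly) in LDS, so shader_out barriers map to shared storage (assumption) */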
+ if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL && (nir_storage & nir_var_shader_out))
+ storage |= storage_shared;
+
+ unsigned nir_semantics = nir_intrinsic_memory_semantics(instr);
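+ /* conservatively widen both acquire and release to acquire+release */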
+ if (nir_semantics & NIR_MEMORY_ACQUIRE)
+ semantics |= semantic_acquire | semantic_release;
+ if (nir_semantics & NIR_MEMORY_RELEASE)
+ semantics |= semantic_acquire | semantic_release;
+
+ assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE)));
+
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope),
+ exec_scope);
}
void visit_load_shared(isel_context *ctx, nir_intrinsic_instr *instr)
op64_rtn = aco_opcode::ds_cmpst_rtn_b64;
num_operands = 4;
break;
+ case nir_intrinsic_shared_atomic_fadd:
+ op32 = aco_opcode::ds_add_f32;
+ op32_rtn = aco_opcode::ds_add_rtn_f32;
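+ /* there is no 64-bit LDS float add here; num_opcodes serves as an "unsupported" sentinel (assumption) */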
+ op64 = aco_opcode::num_opcodes;
+ op64_rtn = aco_opcode::num_opcodes;
+ break;
default:
unreachable("Unhandled shared atomic intrinsic");
}
ds->offset0 = offset;
if (return_previous)
ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa));
+ ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw);
ctx->block->instructions.emplace_back(std::move(ds));
}
scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));
uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
- S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);;
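+ /* INDEX_STRIDE encoding: 2 = 32 elements (wave32), 3 = 64 elements (wave64) */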
+ S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2);
if (ctx->program->chip_class >= GFX10) {
rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) |
S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
}
- /* older generations need element size = 16 bytes. element size removed in GFX9 */
+ /* older generations need element size = 4 bytes; element size was removed in GFX9 */
if (ctx->program->chip_class <= GFX8)
- rsrc_conf |= S_008F0C_ELEMENT_SIZE(3);
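+ /* ELEMENT_SIZE encoding: 1 selects 4-byte elements (3 would be 16) */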
+ rsrc_conf |= S_008F0C_ELEMENT_SIZE(1);
return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand(-1u), Operand(rsrc_conf));
}
instr->dest.ssa.bit_size / 8u, rsrc};
info.align_mul = nir_intrinsic_align_mul(instr);
info.align_offset = nir_intrinsic_align_offset(instr);
- info.swizzle_component_size = 16;
- info.can_reorder = false;
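+ /* GFX6-8 use a swizzled scratch rsrc with 4-byte elements, so split loads per dword; GFX9+ need no swizzle splitting (assumption) */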
+ info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0;
+ info.sync = memory_sync_info(storage_scratch, semantic_private);
info.soffset = ctx->program->scratch_offset;
- emit_mubuf_load(ctx, bld, &info);
+ emit_scratch_load(ctx, bld, &info);
}
void visit_store_scratch(isel_context *ctx, nir_intrinsic_instr *instr) {
unsigned write_count = 0;
Temp write_datas[32];
unsigned offsets[32];
+ unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16;
split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask,
- 16, &write_count, write_datas, offsets);
+ swizzle_component_size, &write_count, write_datas, offsets);
for (unsigned i = 0; i < write_count; i++) {
aco_opcode op = get_buffer_store_op(false, write_datas[i].bytes());
- bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true);
+ Instruction *mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], offsets[i], true, true);
+ static_cast<MUBUF_instruction *>(mubuf)->sync = memory_sync_info(storage_scratch, semantic_private);
}
}
mtbuf->offset = const_offset;
mtbuf->glc = true;
mtbuf->slc = true;
- mtbuf->barrier = barrier_gs_data;
- mtbuf->can_reorder = true;
+ mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder);
bld.insert(std::move(mtbuf));
}
} else if (src.regClass() == s2) {
bld.sop1(aco_opcode::s_mov_b64, dst, src);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
}
}
/* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */
- Temp tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_1, pos1, p1);
- Temp tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddx_2, pos1, p2);
- tmp1 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_1, pos2, tmp1);
- tmp2 = bld.vop3(aco_opcode::v_mad_f32, bld.def(v1), ddy_2, pos2, tmp2);
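+ /* v_mad_f32 is gone on GFX10.3; v_fma_f32 replaces it (assumption) */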
+ aco_opcode mad = ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32;
+ Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1);
+ Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2);
+ tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1);
+ tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2);
Temp wqm1 = bld.tmp(v1);
emit_wqm(ctx, tmp1, wqm1, true);
Temp wqm2 = bld.tmp(v1);
Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
Temp private_segment_buffer = ctx->program->private_segment_buffer;
+ //TODO: bounds checking?
if (addr.type() == RegType::sgpr) {
Operand offset;
if (const_addr) {
load->glc = false;
load->dlc = false;
load->disable_wqm = false;
- load->barrier = barrier_none;
- load->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(load));
}
case nir_intrinsic_shared_atomic_xor:
case nir_intrinsic_shared_atomic_exchange:
case nir_intrinsic_shared_atomic_comp_swap:
+ case nir_intrinsic_shared_atomic_fadd:
visit_shared_atomic(ctx, instr);
break;
case nir_intrinsic_image_deref_load:
case nir_intrinsic_get_buffer_size:
visit_get_buffer_size(ctx, instr);
break;
- case nir_intrinsic_control_barrier: {
- if (ctx->program->chip_class == GFX6 && ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) {
- /* GFX6 only (thanks to a hw bug workaround):
- * The real barrier instruction isn’t needed, because an entire patch
- * always fits into a single wave.
- */
- break;
- }
-
- if (ctx->program->workgroup_size > ctx->program->wave_size)
- bld.sopp(aco_opcode::s_barrier);
-
- break;
- }
- case nir_intrinsic_memory_barrier_tcs_patch:
- case nir_intrinsic_group_memory_barrier:
- case nir_intrinsic_memory_barrier:
- case nir_intrinsic_memory_barrier_buffer:
- case nir_intrinsic_memory_barrier_image:
- case nir_intrinsic_memory_barrier_shared:
- emit_memory_barrier(ctx, instr);
+ case nir_intrinsic_scoped_barrier:
+ emit_scoped_barrier(ctx, instr);
break;
case nir_intrinsic_load_num_work_groups: {
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
} else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) {
bld.vopc(aco_opcode::v_cmp_lg_u64, lanemask_tmp, Operand(0u), src);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
if (dst.size() != bld.lm.size()) {
/* Wave32 with ballot size set to 64 */
tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand(1u), tmp);
emit_wqm(ctx, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), tmp), dst);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
}
break;
} else if (src.regClass() == s2) {
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
emit_split_vector(ctx, dst, 2);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
}
break;
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
emit_split_vector(ctx, dst, 2);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
}
Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
uint32_t mask = nir_intrinsic_swizzle_mask(instr);
- if (dst.regClass() == v1) {
- emit_wqm(ctx,
- bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false),
- dst);
+ if (instr->dest.ssa.bit_size == 1) {
+ assert(src.regClass() == bld.lm);
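+ /* booleans live in lane masks: expand to 0/-1 per lane, swizzle as plain data, then compare back into a mask */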
+ src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand(0u), Operand((uint32_t)-1), src);
+ src = emit_masked_swizzle(ctx, bld, src, mask);
+ Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand(0u), src);
+ emit_wqm(ctx, tmp, dst);
+ } else if (dst.regClass() == v1b || dst.regClass() == v2b) {
+ Temp tmp = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask));
+ emit_extract_vector(ctx, tmp, 0, dst);
+ } else if (dst.regClass() == v1) {
+ emit_wqm(ctx, emit_masked_swizzle(ctx, bld, src, mask), dst);
} else if (dst.regClass() == v2) {
Temp lo = bld.tmp(v1), hi = bld.tmp(v1);
bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src);
- lo = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, mask, 0, false));
- hi = emit_wqm(ctx, bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, mask, 0, false));
+ lo = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, lo, mask));
+ hi = emit_wqm(ctx, emit_masked_swizzle(ctx, bld, hi, mask));
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
emit_split_vector(ctx, dst, 2);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
emit_split_vector(ctx, dst, 2);
} else {
- fprintf(stderr, "Unimplemented NIR instr bit size: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented NIR instr bit size");
}
break;
}
break;
}
case nir_intrinsic_shader_clock: {
- aco_opcode opcode =
- nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ?
- aco_opcode::s_memrealtime : aco_opcode::s_memtime;
- bld.smem(opcode, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), false);
- emit_split_vector(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 2);
+ Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
+ if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP && ctx->options->chip_class >= GFX10_3) {
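+ /* read the free-running SHADER_CYCLES hw register (20 bits wide) rather than s_memtime */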
+ /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */
+ Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29);
+ bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand(0u));
+ } else {
+ aco_opcode opcode =
+ nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE ?
+ aco_opcode::s_memrealtime : aco_opcode::s_memtime;
+ bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile));
+ }
+ emit_split_vector(ctx, dst, 2);
break;
}
case nir_intrinsic_load_vertex_id_zero_base: {
break;
}
default:
- fprintf(stderr, "Unimplemented intrinsic instr: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unimplemented intrinsic instr");
abort();
break;
{
Builder bld(ctx->program, ctx->block);
Temp ma, tc, sc, id;
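+ /* as with v_mad_f32, v_madak/v_madmk were dropped on GFX10.3 (assumption) */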
+ aco_opcode madak = ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32;
+ aco_opcode madmk = ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32;
if (is_array) {
coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]);
sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
if (!is_deriv)
- sc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
+ sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand(0x3fc00000u/*1.5*/));
tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]);
if (!is_deriv)
- tc = bld.vop2(aco_opcode::v_madak_f32, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
+ tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand(0x3fc00000u/*1.5*/));
id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]);
}
if (is_array)
- id = bld.vop2(aco_opcode::v_madmk_f32, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/));
+ id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand(0x41000000u/*8.0*/));
coords.resize(3);
coords[0] = sc;
coords[1] = tc;
tex->da = da;
tex->definitions[0] = Definition(tmp_dst);
tex->dim = dim;
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
if (div_by_6) {
tex->da = da;
Temp size = bld.tmp(v2);
tex->definitions[0] = Definition(size);
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
emit_split_vector(ctx, size, size.size());
mubuf->operands[2] = Operand((uint32_t) 0);
mubuf->definitions[0] = Definition(tmp_dst);
mubuf->idxen = true;
- mubuf->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(mubuf));
expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, (1 << last_bit) - 1);
tex->unrm = true;
tex->da = da;
tex->definitions[0] = Definition(tmp_dst);
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
if (instr->op == nir_texop_samples_identical) {
tex->dmask = dmask;
tex->da = da;
tex->definitions[0] = Definition(tmp_dst);
- tex->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(tex));
if (tg4_integer_cube_workaround) {
}
-Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc)
+Operand get_phi_operand(isel_context *ctx, nir_ssa_def *ssa, RegClass rc, bool logical)
{
Temp tmp = get_ssa_temp(ctx, ssa);
- if (ssa->parent_instr->type == nir_instr_type_ssa_undef)
+ if (ssa->parent_instr->type == nir_instr_type_ssa_undef) {
return Operand(rc);
- else
+ } else if (logical && ssa->bit_size == 1 && ssa->parent_instr->type == nir_instr_type_load_const) {
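+ /* inline uniform boolean constants as full lane masks sized to the wave */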
+ if (ctx->program->wave_size == 64)
+ return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX : 0u);
+ else
+ return Operand(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT32_MAX : 0u);
+ } else {
return Operand(tmp);
+ }
}
void visit_phi(isel_context *ctx, nir_phi_instr *instr)
if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size())
continue;
cur_pred_idx++;
- Operand op = get_phi_operand(ctx, src.second, dst.regClass());
+ Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical);
operands[num_operands++] = op;
num_defined += !op.isUndefined();
}
}
break;
default:
- fprintf(stderr, "Unknown NIR jump instr: ");
- nir_print_instr(&instr->instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(&instr->instr, "Unknown NIR jump instr");
abort();
}
visit_jump(ctx, nir_instr_as_jump(instr));
break;
default:
- fprintf(stderr, "Unknown NIR instr type: ");
- nir_print_instr(instr, stderr);
- fprintf(stderr, "\n");
+ isel_err(instr, "Unknown NIR instr type");
//abort();
}
}
else
exp->operands[i] = Operand(v1);
}
- /* Navi10-14 skip POS0 exports if EXEC=0 and DONE=0, causing a hang.
+ /* GFX10 (Navi1x) skips POS0 exports if EXEC=0 and DONE=0, causing a hang.
* Setting valid_mask=1 prevents it and has no other effect.
*/
- exp->valid_mask = ctx->options->chip_class >= GFX10 && is_pos && *next_pos == 0;
+ exp->valid_mask = ctx->options->chip_class == GFX10 && is_pos && *next_pos == 0;
exp->done = false;
exp->compressed = false;
if (is_pos)
exp->enabled_mask |= 0x4;
}
}
- exp->valid_mask = ctx->options->chip_class >= GFX10 && *next_pos == 0;
+ exp->valid_mask = ctx->options->chip_class == GFX10 && *next_pos == 0;
exp->done = false;
exp->compressed = false;
exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++;
create_null_export(ctx);
}
+static void create_workgroup_barrier(Builder& bld)
+{
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup),
+ scope_workgroup);
+}
+
static void write_tcs_tess_factors(isel_context *ctx)
{
unsigned outer_comps;
Builder bld(ctx->program, ctx->block);
- bld.barrier(aco_opcode::p_memory_barrier_shared);
- if (unlikely(ctx->program->chip_class != GFX6 && ctx->program->workgroup_size > ctx->program->wave_size))
- bld.sopp(aco_opcode::s_barrier);
+ create_workgroup_barrier(bld);
Temp tcs_rel_ids = get_arg(ctx, ctx->args->ac.tcs_rel_ids);
Temp invocation_id = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), tcs_rel_ids, Operand(8u), Operand(5u));
Temp control_word = bld.copy(bld.def(v1), Operand(0x80000000u));
bld.mubuf(aco_opcode::buffer_store_dword,
/* SRSRC */ hs_ring_tess_factor, /* VADDR */ Operand(v1), /* SOFFSET */ tf_base, /* VDATA */ control_word,
- /* immediate OFFSET */ 0, /* OFFEN */ false, /* idxen*/ false, /* addr64 */ false,
- /* disable_wqm */ false, /* glc */ true);
+ /* immediate OFFSET */ 0, /* OFFEN */ false, /* swizzled */ false, /* idxen */ false,
+ /* addr64 */ false, /* disable_wqm */ false, /* glc */ true);
tf_const_offset += 4;
begin_divergent_if_else(ctx, &ic_rel_patch_id_is_zero);
assert(stride == 2 || stride == 4 || stride == 6);
Temp tf_vec = create_vec_from_array(ctx, out, stride, RegType::vgpr, 4u);
- store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, false);
+ store_vmem_mubuf(ctx, tf_vec, hs_ring_tess_factor, byte_offset, tf_base, tf_const_offset, 4, (1 << stride) - 1, true, memory_sync_info());
/* Store to offchip for TES to read - only if TES reads them */
if (ctx->args->options->key.tcs.tes_reads_tess_factors) {
Temp oc_lds = get_arg(ctx, ctx->args->oc_lds);
std::pair<Temp, unsigned> vmem_offs_outer = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_out_loc);
- store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, false);
+ store_vmem_mubuf(ctx, tf_outer_vec, hs_ring_tess_offchip, vmem_offs_outer.first, oc_lds, vmem_offs_outer.second, 4, (1 << outer_comps) - 1, true, memory_sync_info(storage_vmem_output));
if (likely(inner_comps)) {
std::pair<Temp, unsigned> vmem_offs_inner = get_tcs_per_patch_output_vmem_offset(ctx, nullptr, ctx->tcs_tess_lvl_in_loc);
- store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, false);
+ store_vmem_mubuf(ctx, tf_inner_vec, hs_ring_tess_offchip, vmem_offs_inner.first, oc_lds, vmem_offs_inner.second, 4, (1 << inner_comps) - 1, true, memory_sync_info(storage_vmem_output));
}
}
store->glc = true;
store->dlc = false;
store->slc = true;
- store->can_reorder = true;
ctx->block->instructions.emplace_back(std::move(store));
}
}
if (ctx->stage == ngg_vertex_gs) {
/* Wait for GS threads to store primitive ID in LDS. */
- bld.barrier(aco_opcode::p_memory_barrier_shared);
- bld.sopp(aco_opcode::s_barrier);
+ create_workgroup_barrier(bld);
/* Calculate LDS address where the GS threads stored the primitive ID. */
Temp wave_id_in_tg = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
if (i) {
Builder bld(ctx.program, ctx.block);
- bld.barrier(aco_opcode::p_memory_barrier_shared);
- bld.sopp(aco_opcode::s_barrier);
+ create_workgroup_barrier(bld);
if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
ctx.gs_wave_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1, m0), bld.def(s1, scc), get_arg(&ctx, args->merged_wave_info), Operand((8u << 16) | 16u));
ngg_emit_nogs_output(&ctx);
} else if (nir->info.stage == MESA_SHADER_GEOMETRY) {
Builder bld(ctx.program, ctx.block);
- bld.barrier(aco_opcode::p_memory_barrier_gs_data);
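+ /* release the GS ring stores to device scope before signalling GS done (assumption) */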
+ bld.barrier(aco_opcode::p_barrier,
+ memory_sync_info(storage_vmem_output, semantic_release, scope_device));
bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1, sendmsg_gs_done(false, false, 0));
} else if (nir->info.stage == MESA_SHADER_TESS_CTRL) {
write_tcs_tess_factors(&ctx);
ctx.block->kind |= block_kind_uniform;
Builder bld(ctx.program, ctx.block);
if (ctx.program->wb_smem_l1_on_end)
- bld.smem(aco_opcode::s_dcache_wb, false);
+ bld.smem(aco_opcode::s_dcache_wb, memory_sync_info(storage_buffer, semantic_volatile));
bld.sopp(aco_opcode::s_endpgm);
cleanup_cfg(program);
{
isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);
- program->next_fp_mode.preserve_signed_zero_inf_nan32 = false;
- program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = false;
- program->next_fp_mode.must_flush_denorms32 = false;
- program->next_fp_mode.must_flush_denorms16_64 = false;
- program->next_fp_mode.care_about_round32 = false;
- program->next_fp_mode.care_about_round16_64 = false;
- program->next_fp_mode.denorm16_64 = fp_denorm_keep;
- program->next_fp_mode.denorm32 = 0;
- program->next_fp_mode.round32 = fp_round_ne;
- program->next_fp_mode.round16_64 = fp_round_ne;
ctx.block->fp_mode = program->next_fp_mode;
add_startpgm(&ctx);
mubuf->glc = true;
mubuf->slc = true;
mubuf->dlc = args->options->chip_class >= GFX10;
- mubuf->barrier = barrier_none;
- mubuf->can_reorder = true;
ctx.outputs.mask[i] |= 1 << j;
ctx.outputs.temps[i * 4u + j] = mubuf->definitions[0].getTemp();